; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm5, %xmm1
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm2, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm3, %xmm1
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: psubq %xmm7, %xmm3
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm4, %xmm0
; SSE-NEXT: psubd %xmm5, %xmm1
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: psubd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = sub <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = zext <16 x i8> %y to <16 x i16>
  %c = sub <16 x i16> %a, %b
  %d = trunc <16 x i16> %c to <16 x i8>
  ret <16 x i8> %d
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
SSE-NEXT: packuswb %xmm1, %xmm0 1474; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1475; SSE-NEXT: retq 1476; 1477; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: 1478; AVX1: # %bb.0: 1479; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1480; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1481; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1482; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1483; AVX1-NEXT: vzeroupper 1484; AVX1-NEXT: retq 1485; 1486; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: 1487; AVX2: # %bb.0: 1488; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1489; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1490; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1491; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1492; AVX2-NEXT: vzeroupper 1493; AVX2-NEXT: retq 1494; 1495; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: 1496; AVX512F: # %bb.0: 1497; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1498; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1499; AVX512F-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1500; AVX512F-NEXT: vzeroupper 1501; AVX512F-NEXT: retq 1502; 1503; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: 1504; AVX512BW: # %bb.0: 1505; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1506; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1507; AVX512BW-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1508; AVX512BW-NEXT: vzeroupper 1509; AVX512BW-NEXT: retq 1510; 1511; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: 1512; AVX512DQ: # %bb.0: 1513; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1514; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1515; AVX512DQ-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1516; AVX512DQ-NEXT: vzeroupper 1517; AVX512DQ-NEXT: retq 1518 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1519 %2 = trunc <16 x i16> %1 to <16 x i8> 1520 ret <16 x i8> %2 1521} 1522 1523define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { 1524; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1525; SSE: # %bb.0: 1526; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1527; SSE-NEXT: retq 1528; 1529; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1530; AVX: # %bb.0: 1531; AVX-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1532; AVX-NEXT: retq 1533 %a = zext <16 x i8> %x to <16 x i16> 1534 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1535 %c = trunc <16 x i16> %b to <16 x i8> 1536 ret <16 x i8> %c 1537} 1538 1539define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { 1540; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1541; SSE: # %bb.0: 1542; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1543; SSE-NEXT: psubb %xmm0, %xmm1 1544; SSE-NEXT: movdqa %xmm1, %xmm0 1545; SSE-NEXT: retq 1546; 1547; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1548; AVX: # %bb.0: 1549; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1550; 
AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1551; AVX-NEXT: retq 1552 %a = zext <16 x i8> %x to <16 x i16> 1553 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a 1554 %c = trunc <16 x i16> %b to <16 x i8> 1555 ret <16 x i8> %c 1556} 1557 1558; 1559; mul 1560; 1561 1562define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1563; SSE-LABEL: trunc_mul_v4i64_v4i32: 1564; SSE: # %bb.0: 1565; SSE-NEXT: pmuludq %xmm3, %xmm1 1566; SSE-NEXT: pmuludq %xmm2, %xmm0 1567; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1568; SSE-NEXT: retq 1569; 1570; AVX1-LABEL: trunc_mul_v4i64_v4i32: 1571; AVX1: # %bb.0: 1572; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1573; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1574; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1575; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1576; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1577; AVX1-NEXT: vzeroupper 1578; AVX1-NEXT: retq 1579; 1580; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: 1581; AVX2-SLOW: # %bb.0: 1582; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 1583; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1584; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 1585; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1586; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1587; AVX2-SLOW-NEXT: vzeroupper 1588; AVX2-SLOW-NEXT: retq 1589; 1590; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32: 1591; AVX2-FAST-ALL: # %bb.0: 1592; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1593; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 1594; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 1595; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1596; AVX2-FAST-ALL-NEXT: vzeroupper 1597; AVX2-FAST-ALL-NEXT: retq 1598; 1599; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32: 1600; AVX2-FAST-PERLANE: # %bb.0: 1601; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 1602; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1603; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2 1604; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1605; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1606; AVX2-FAST-PERLANE-NEXT: vzeroupper 1607; AVX2-FAST-PERLANE-NEXT: retq 1608; 1609; AVX512F-LABEL: trunc_mul_v4i64_v4i32: 1610; AVX512F: # %bb.0: 1611; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1612; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1613; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1614; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1615; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1616; AVX512F-NEXT: vzeroupper 1617; AVX512F-NEXT: retq 1618; 1619; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: 1620; AVX512BW: # %bb.0: 1621; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1622; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1623; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1624; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1625; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1626; AVX512BW-NEXT: vzeroupper 1627; AVX512BW-NEXT: retq 1628; 1629; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: 1630; AVX512DQ: # %bb.0: 1631; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1632; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1633; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1634; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 1635; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1636; AVX512DQ-NEXT: vzeroupper 1637; AVX512DQ-NEXT: retq 1638 %1 = mul <4 x i64> %a0, %a1 
1639 %2 = trunc <4 x i64> %1 to <4 x i32> 1640 ret <4 x i32> %2 1641} 1642 1643define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1644; SSE-LABEL: trunc_mul_v8i64_v8i16: 1645; SSE: # %bb.0: 1646; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] 1647; SSE-NEXT: pslld $16, %xmm6 1648; SSE-NEXT: psrad $16, %xmm6 1649; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] 1650; SSE-NEXT: pslld $16, %xmm4 1651; SSE-NEXT: psrad $16, %xmm4 1652; SSE-NEXT: packssdw %xmm6, %xmm4 1653; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 1654; SSE-NEXT: pslld $16, %xmm2 1655; SSE-NEXT: psrad $16, %xmm2 1656; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1657; SSE-NEXT: pslld $16, %xmm0 1658; SSE-NEXT: psrad $16, %xmm0 1659; SSE-NEXT: packssdw %xmm2, %xmm0 1660; SSE-NEXT: pmullw %xmm4, %xmm0 1661; SSE-NEXT: retq 1662; 1663; AVX1-LABEL: trunc_mul_v8i64_v8i16: 1664; AVX1: # %bb.0: 1665; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535] 1666; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1667; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1668; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1669; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1670; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1671; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1672; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1673; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1674; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1675; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1676; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1677; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1678; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1679; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1680; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1681; AVX1-NEXT: vzeroupper 1682; AVX1-NEXT: retq 1683; 1684; AVX2-LABEL: trunc_mul_v8i64_v8i16: 1685; AVX2: # %bb.0: 1686; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1687; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15] 1688; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15] 1689; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1690; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1691; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1692; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15] 1693; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15] 1694; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1695; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1696; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1697; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1698; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1699; AVX2-NEXT: vzeroupper 1700; AVX2-NEXT: retq 1701; 1702; AVX512F-LABEL: trunc_mul_v8i64_v8i16: 1703; AVX512F: # %bb.0: 1704; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1705; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1706; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1707; AVX512F-NEXT: vzeroupper 1708; AVX512F-NEXT: retq 1709; 1710; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: 1711; AVX512BW: # %bb.0: 1712; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1713; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1714; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1715; AVX512BW-NEXT: vzeroupper 1716; AVX512BW-NEXT: retq 1717; 1718; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: 1719; AVX512DQ: # %bb.0: 1720; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1721; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 1722; AVX512DQ-NEXT: vzeroupper 1723; AVX512DQ-NEXT: retq 1724 %1 = mul 
<8 x i64> %a0, %a1 1725 %2 = trunc <8 x i64> %1 to <8 x i16> 1726 ret <8 x i16> %2 1727} 1728 1729define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1730; SSE-LABEL: trunc_mul_v8i32_v8i16: 1731; SSE: # %bb.0: 1732; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1733; SSE-NEXT: pmuludq %xmm2, %xmm0 1734; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1735; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1736; SSE-NEXT: pmuludq %xmm4, %xmm2 1737; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1738; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1739; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1740; SSE-NEXT: pmuludq %xmm3, %xmm1 1741; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1742; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1743; SSE-NEXT: pmuludq %xmm2, %xmm3 1744; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 1745; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1746; SSE-NEXT: pslld $16, %xmm1 1747; SSE-NEXT: psrad $16, %xmm1 1748; SSE-NEXT: pslld $16, %xmm0 1749; SSE-NEXT: psrad $16, %xmm0 1750; SSE-NEXT: packssdw %xmm1, %xmm0 1751; SSE-NEXT: retq 1752; 1753; AVX1-LABEL: trunc_mul_v8i32_v8i16: 1754; AVX1: # %bb.0: 1755; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 1756; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1757; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1758; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1759; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1760; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 1761; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 1762; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 1763; AVX1-NEXT: vzeroupper 1764; AVX1-NEXT: retq 1765; 1766; AVX2-LABEL: trunc_mul_v8i32_v8i16: 1767; AVX2: # %bb.0: 1768; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1769; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1770; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1771; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1772; AVX2-NEXT: vzeroupper 1773; AVX2-NEXT: retq 1774; 1775; AVX512-LABEL: trunc_mul_v8i32_v8i16: 1776; AVX512: # %bb.0: 1777; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1778; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1779; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1780; AVX512-NEXT: vzeroupper 1781; AVX512-NEXT: retq 1782 %1 = mul <8 x i32> %a0, %a1 1783 %2 = trunc <8 x i32> %1 to <8 x i16> 1784 ret <8 x i16> %2 1785} 1786 1787define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1788; SSE-LABEL: trunc_mul_v16i64_v16i8: 1789; SSE: # %bb.0: 1790; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 1791; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 1792; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 1793; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 1794; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 1795; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 1796; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 1797; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 1798; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1799; SSE-NEXT: pand %xmm8, %xmm7 1800; SSE-NEXT: pand %xmm8, %xmm6 1801; SSE-NEXT: packuswb %xmm7, %xmm6 1802; SSE-NEXT: pand %xmm8, %xmm5 1803; SSE-NEXT: pand %xmm8, %xmm4 1804; SSE-NEXT: packuswb %xmm5, %xmm4 1805; SSE-NEXT: packuswb %xmm6, %xmm4 1806; SSE-NEXT: pand %xmm8, %xmm3 1807; SSE-NEXT: pand %xmm8, %xmm2 1808; SSE-NEXT: packuswb %xmm3, %xmm2 1809; SSE-NEXT: pand %xmm8, %xmm1 1810; SSE-NEXT: 
pand %xmm8, %xmm0 1811; SSE-NEXT: packuswb %xmm1, %xmm0 1812; SSE-NEXT: packuswb %xmm2, %xmm0 1813; SSE-NEXT: packuswb %xmm4, %xmm0 1814; SSE-NEXT: retq 1815; 1816; AVX1-LABEL: trunc_mul_v16i64_v16i8: 1817; AVX1: # %bb.0: 1818; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 1819; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 1820; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1821; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 1822; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 1823; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 1824; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1825; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 1826; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 1827; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1828; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1829; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 1830; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 1831; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1832; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1833; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 1834; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255] 1835; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1836; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1837; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1838; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1839; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1840; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1841; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1842; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 1843; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 1844; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1845; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 1846; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 1847; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1848; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1849; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1850; AVX1-NEXT: vzeroupper 1851; AVX1-NEXT: retq 1852; 1853; AVX2-LABEL: trunc_mul_v16i64_v16i8: 1854; AVX2: # %bb.0: 1855; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 1856; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 1857; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 1858; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 1859; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1860; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1861; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1862; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1863; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 1864; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1865; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1866; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1867; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1868; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 1869; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1870; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1871; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1872; AVX2-NEXT: vzeroupper 1873; AVX2-NEXT: retq 1874; 1875; AVX512F-LABEL: trunc_mul_v16i64_v16i8: 1876; AVX512F: # %bb.0: 1877; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1878; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1879; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 1880; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 1881; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1882; AVX512F-NEXT: vzeroupper 1883; AVX512F-NEXT: retq 1884; 1885; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: 1886; AVX512BW: # %bb.0: 1887; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1888; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1889; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 1890; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 1891; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1892; AVX512BW-NEXT: vzeroupper 1893; AVX512BW-NEXT: retq 1894; 1895; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: 1896; 
AVX512DQ: # %bb.0: 1897; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 1898; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 1899; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 1900; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 1901; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1902; AVX512DQ-NEXT: vzeroupper 1903; AVX512DQ-NEXT: retq 1904 %1 = mul <16 x i64> %a0, %a1 1905 %2 = trunc <16 x i64> %1 to <16 x i8> 1906 ret <16 x i8> %2 1907} 1908 1909define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1910; SSE-LABEL: trunc_mul_v16i32_v16i8: 1911; SSE: # %bb.0: 1912; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 1913; SSE-NEXT: pmuludq %xmm4, %xmm0 1914; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1915; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1916; SSE-NEXT: pmuludq %xmm8, %xmm4 1917; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1918; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1919; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1920; SSE-NEXT: pmuludq %xmm5, %xmm1 1921; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1922; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1923; SSE-NEXT: pmuludq %xmm4, %xmm5 1924; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1925; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 1926; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 1927; SSE-NEXT: pmuludq %xmm6, %xmm2 1928; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1929; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 1930; SSE-NEXT: pmuludq %xmm4, %xmm5 1931; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1932; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 1933; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 1934; SSE-NEXT: pmuludq %xmm7, %xmm3 1935; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1936; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 1937; SSE-NEXT: pmuludq %xmm4, %xmm5 1938; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1939; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 1940; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1941; SSE-NEXT: pand %xmm4, %xmm3 1942; SSE-NEXT: pand %xmm4, %xmm2 1943; SSE-NEXT: packuswb %xmm3, %xmm2 1944; SSE-NEXT: pand %xmm4, %xmm1 1945; SSE-NEXT: pand %xmm4, %xmm0 1946; SSE-NEXT: packuswb %xmm1, %xmm0 1947; SSE-NEXT: packuswb %xmm2, %xmm0 1948; SSE-NEXT: retq 1949; 1950; AVX1-LABEL: trunc_mul_v16i32_v16i8: 1951; AVX1: # %bb.0: 1952; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 1953; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1954; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1955; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 1956; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 1957; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1958; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1959; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 1960; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255] 1961; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1962; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1963; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1964; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1965; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 1966; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1967; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1968; AVX1-NEXT: vzeroupper 1969; AVX1-NEXT: retq 1970; 1971; AVX2-LABEL: trunc_mul_v16i32_v16i8: 1972; AVX2: # %bb.0: 1973; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 1974; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 1975; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1976; AVX2-NEXT: vpand 
%ymm2, %ymm1, %ymm1 1977; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1978; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1979; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1980; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1981; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1982; AVX2-NEXT: vzeroupper 1983; AVX2-NEXT: retq 1984; 1985; AVX512-LABEL: trunc_mul_v16i32_v16i8: 1986; AVX512: # %bb.0: 1987; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1988; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1989; AVX512-NEXT: vzeroupper 1990; AVX512-NEXT: retq 1991 %1 = mul <16 x i32> %a0, %a1 1992 %2 = trunc <16 x i32> %1 to <16 x i8> 1993 ret <16 x i8> %2 1994} 1995 1996define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 1997; SSE-LABEL: trunc_mul_v16i16_v16i8: 1998; SSE: # %bb.0: 1999; SSE-NEXT: pmullw %xmm2, %xmm0 2000; SSE-NEXT: pmullw %xmm3, %xmm1 2001; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2002; SSE-NEXT: pand %xmm2, %xmm1 2003; SSE-NEXT: pand %xmm2, %xmm0 2004; SSE-NEXT: packuswb %xmm1, %xmm0 2005; SSE-NEXT: retq 2006; 2007; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2008; AVX1: # %bb.0: 2009; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2010; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2011; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2012; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2013; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 2014; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 2015; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 2016; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2017; AVX1-NEXT: vzeroupper 2018; AVX1-NEXT: retq 2019; 2020; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2021; AVX2: # %bb.0: 2022; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2023; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2024; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2025; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2026; AVX2-NEXT: vzeroupper 2027; AVX2-NEXT: retq 2028; 2029; AVX512F-LABEL: trunc_mul_v16i16_v16i8: 2030; AVX512F: # %bb.0: 2031; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2032; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2033; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2034; AVX512F-NEXT: vzeroupper 2035; AVX512F-NEXT: retq 2036; 2037; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2038; AVX512BW: # %bb.0: 2039; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2040; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2041; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2042; AVX512BW-NEXT: vzeroupper 2043; AVX512BW-NEXT: retq 2044; 2045; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: 2046; AVX512DQ: # %bb.0: 2047; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2048; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2049; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2050; AVX512DQ-NEXT: vzeroupper 2051; AVX512DQ-NEXT: retq 2052 %1 = mul <16 x i16> %a0, %a1 2053 %2 = trunc <16 x i16> %1 to <16 x i8> 2054 ret <16 x i8> %2 2055} 2056 2057define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 2058; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2059; SSE: # %bb.0: 2060; SSE-NEXT: pxor %xmm3, %xmm3 2061; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2062; SSE-NEXT: pslld $16, %xmm2 2063; SSE-NEXT: psrad $16, %xmm2 2064; SSE-NEXT: pslld $16, %xmm1 2065; SSE-NEXT: psrad $16, %xmm1 2066; SSE-NEXT: packssdw %xmm2, %xmm1 2067; SSE-NEXT: pmullw %xmm1, %xmm0 2068; SSE-NEXT: retq 2069; 2070; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2071; AVX1: # %bb.0: 2072; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2073; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2074; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2075; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2076; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2077; AVX1-NEXT: vzeroupper 2078; AVX1-NEXT: retq 2079; 2080; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2081; AVX2: # %bb.0: 2082; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2083; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2084; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2085; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2086; AVX2-NEXT: vzeroupper 2087; AVX2-NEXT: retq 2088; 2089; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2090; AVX512: # %bb.0: 2091; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2092; AVX512-NEXT: vpmovdw %zmm1, %ymm1 2093; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2094; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2095; AVX512-NEXT: vzeroupper 2096; AVX512-NEXT: retq 2097 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2098 %2 = zext <8 x i8> %1 to <8 x i32> 2099 %3 = mul <8 x i32> %2, %a1 2100 %4 = trunc <8 x i32> %3 to <8 x i16> 2101 ret <8 x i16> %4 2102} 2103 2104; 2105; mul to constant 2106; 2107 2108define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2109; SSE-LABEL: trunc_mul_const_v4i64_v4i32: 2110; SSE: # %bb.0: 2111; SSE-NEXT: xorps %xmm2, %xmm2 2112; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 2113; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2114; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2115; SSE-NEXT: movaps %xmm2, %xmm0 2116; SSE-NEXT: retq 2117; 2118; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: 2119; AVX1: # %bb.0: 2120; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2121; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2122; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2123; AVX1-NEXT: vzeroupper 2124; AVX1-NEXT: retq 2125; 2126; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: 2127; AVX2-SLOW: # %bb.0: 2128; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2129; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2130; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2131; AVX2-SLOW-NEXT: vzeroupper 2132; AVX2-SLOW-NEXT: retq 2133; 2134; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: 2135; AVX2-FAST-ALL: # %bb.0: 2136; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] 2137; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 2138; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2139; AVX2-FAST-ALL-NEXT: vzeroupper 2140; AVX2-FAST-ALL-NEXT: retq 2141; 2142; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32: 
2143; AVX2-FAST-PERLANE: # %bb.0: 2144; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2145; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2146; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2147; AVX2-FAST-PERLANE-NEXT: vzeroupper 2148; AVX2-FAST-PERLANE-NEXT: retq 2149; 2150; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: 2151; AVX512: # %bb.0: 2152; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2153; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2154; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2155; AVX512-NEXT: vzeroupper 2156; AVX512-NEXT: retq 2157 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2158 %2 = trunc <4 x i64> %1 to <4 x i32> 2159 ret <4 x i32> %2 2160} 2161 2162define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2163; SSE-LABEL: trunc_mul_const_v8i64_v8i16: 2164; SSE: # %bb.0: 2165; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 2166; SSE-NEXT: pslld $16, %xmm2 2167; SSE-NEXT: psrad $16, %xmm2 2168; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2169; SSE-NEXT: pslld $16, %xmm0 2170; SSE-NEXT: psrad $16, %xmm0 2171; SSE-NEXT: packssdw %xmm2, %xmm0 2172; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3,4,5,6,7] 2173; SSE-NEXT: retq 2174; 2175; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: 2176; AVX1: # %bb.0: 2177; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 2178; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2179; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2180; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2181; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2182; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2183; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2184; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2185; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7] 2186; AVX1-NEXT: vzeroupper 2187; AVX1-NEXT: retq 2188; 2189; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: 2190; AVX2: # %bb.0: 2191; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2192; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2193; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2194; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2195; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2196; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2197; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2198; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7] 2199; AVX2-NEXT: vzeroupper 2200; AVX2-NEXT: retq 2201; 2202; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: 2203; AVX512: # %bb.0: 2204; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2205; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7] 2206; AVX512-NEXT: vzeroupper 2207; AVX512-NEXT: retq 2208 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2209 %2 = trunc <8 x i64> %1 to <8 x i16> 2210 ret <8 x i16> %2 2211} 2212 2213define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 2214; SSE-LABEL: trunc_mul_const_v8i32_v8i16: 2215; SSE: # %bb.0: 2216; SSE-NEXT: pslld $16, %xmm1 2217; SSE-NEXT: psrad $16, %xmm1 2218; SSE-NEXT: pslld $16, %xmm0 2219; SSE-NEXT: psrad $16, %xmm0 2220; SSE-NEXT: packssdw %xmm1, %xmm0 2221; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3,4,5,6,7] 2222; SSE-NEXT: retq 2223; 2224; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: 2225; AVX1: # %bb.0: 2226; AVX1-NEXT: vandps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2227; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2228; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2229; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7] 2230; AVX1-NEXT: vzeroupper 2231; AVX1-NEXT: retq 2232; 2233; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: 2234; AVX2: # %bb.0: 2235; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2236; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2237; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7] 2238; AVX2-NEXT: vzeroupper 2239; AVX2-NEXT: retq 2240; 2241; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: 2242; AVX512: # %bb.0: 2243; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2244; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2245; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3,4,5,6,7] 2246; AVX512-NEXT: vzeroupper 2247; AVX512-NEXT: retq 2248 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2249 %2 = trunc <8 x i32> %1 to <8 x i16> 2250 ret <8 x i16> %2 2251} 2252 2253define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 2254; SSE-LABEL: trunc_mul_const_v16i64_v16i8: 2255; SSE: # %bb.0: 2256; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2257; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2258; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2259; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2260; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 2261; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 2262; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 2263; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2264; SSE-NEXT: pand %xmm8, %xmm7 2265; SSE-NEXT: pand %xmm8, %xmm6 2266; SSE-NEXT: packuswb %xmm7, %xmm6 2267; SSE-NEXT: pand %xmm8, %xmm5 2268; SSE-NEXT: pand %xmm8, %xmm4 2269; SSE-NEXT: packuswb %xmm5, %xmm4 2270; SSE-NEXT: packuswb %xmm6, %xmm4 2271; SSE-NEXT: pand %xmm8, %xmm3 2272; SSE-NEXT: pand %xmm8, %xmm2 2273; SSE-NEXT: packuswb %xmm3, %xmm2 2274; SSE-NEXT: pand %xmm8, %xmm1 2275; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2276; SSE-NEXT: packuswb %xmm1, %xmm0 2277; SSE-NEXT: packuswb %xmm2, %xmm0 2278; SSE-NEXT: packuswb %xmm4, %xmm0 2279; SSE-NEXT: retq 2280; 2281; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: 2282; AVX1: # %bb.0: 2283; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 2284; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2285; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2286; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 2287; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2288; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2289; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6 2290; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2291; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 2292; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 2293; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2294; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 2295; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm8 = [255,255] 2296; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 2297; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 2298; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 2299; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 2300; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 2301; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 2302; 
AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2303; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 2304; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm3 2305; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2306; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 2307; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2308; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2309; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2310; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2311; AVX1-NEXT: vzeroupper 2312; AVX1-NEXT: retq 2313; 2314; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: 2315; AVX2: # %bb.0: 2316; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2317; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2318; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2319; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2320; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2321; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2322; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2323; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2324; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2325; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 2326; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 2327; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2328; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2329; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2330; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2331; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2332; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2333; AVX2-NEXT: vzeroupper 2334; AVX2-NEXT: retq 2335; 2336; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: 2337; AVX512F: # %bb.0: 2338; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2339; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2340; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 2341; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 2342; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2343; AVX512F-NEXT: vzeroupper 2344; AVX512F-NEXT: retq 2345; 2346; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: 2347; AVX512BW: # %bb.0: 2348; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2349; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2350; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 2351; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 2352; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2353; AVX512BW-NEXT: vzeroupper 2354; AVX512BW-NEXT: retq 2355; 2356; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: 2357; AVX512DQ: # %bb.0: 2358; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2359; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2360; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 2361; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 2362; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2363; AVX512DQ-NEXT: vzeroupper 2364; AVX512DQ-NEXT: retq 2365 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2366 %2 = trunc <16 x i64> %1 to <16 x i8> 2367 ret <16 x i8> %2 2368} 2369 2370define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2371; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2372; SSE: # %bb.0: 2373; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2374; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2375; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2376; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2377; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2378; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2379; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 2380; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2381; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2382; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2383; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2384; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2385; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 2386; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2387; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2388; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2389; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2390; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2391; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2392; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2393; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2394; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2395; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2396; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2397; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2398; SSE-NEXT: pand %xmm4, %xmm3 2399; SSE-NEXT: pand %xmm4, %xmm2 2400; SSE-NEXT: packuswb %xmm3, %xmm2 2401; SSE-NEXT: pand %xmm4, %xmm1 2402; SSE-NEXT: pand %xmm4, %xmm0 2403; SSE-NEXT: packuswb %xmm1, %xmm0 2404; SSE-NEXT: packuswb %xmm2, %xmm0 2405; SSE-NEXT: retq 2406; 2407; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2408; AVX1: # %bb.0: 2409; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 2410; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2411; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2412; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 2413; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2414; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2415; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255] 2416; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2417; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2418; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2419; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2420; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2421; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2422; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2423; AVX1-NEXT: vzeroupper 2424; AVX1-NEXT: retq 2425; 2426; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 2427; AVX2: # %bb.0: 2428; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2429; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2430; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2431; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2432; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2433; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2434; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2435; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2436; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2437; AVX2-NEXT: vzeroupper 2438; AVX2-NEXT: retq 2439; 2440; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 2441; AVX512: # %bb.0: 2442; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2443; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2444; AVX512-NEXT: vzeroupper 2445; AVX512-NEXT: retq 2446 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2447 %2 = trunc <16 x i32> %1 to <16 x i8> 2448 ret <16 x i8> %2 2449} 2450 2451define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 
2452; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 2453; SSE: # %bb.0: 2454; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3,4,5,6,7] 2455; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,9,10,11,12,13,14,15] 2456; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2457; SSE-NEXT: pand %xmm2, %xmm1 2458; SSE-NEXT: pand %xmm2, %xmm0 2459; SSE-NEXT: packuswb %xmm1, %xmm0 2460; SSE-NEXT: retq 2461; 2462; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 2463; AVX1: # %bb.0: 2464; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,2,3,4,5,6,7] 2465; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2466; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [8,9,10,11,12,13,14,15] 2467; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2468; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2469; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2470; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2471; AVX1-NEXT: vzeroupper 2472; AVX1-NEXT: retq 2473; 2474; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 2475; AVX2: # %bb.0: 2476; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 2477; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2478; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2479; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2480; AVX2-NEXT: vzeroupper 2481; AVX2-NEXT: retq 2482; 2483; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 2484; AVX512F: # %bb.0: 2485; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 2486; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2487; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2488; AVX512F-NEXT: vzeroupper 2489; AVX512F-NEXT: retq 2490; 2491; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 2492; AVX512BW: # %bb.0: 2493; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 2494; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2495; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2496; AVX512BW-NEXT: vzeroupper 2497; AVX512BW-NEXT: retq 2498; 2499; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: 2500; AVX512DQ: # %bb.0: 2501; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 2502; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2503; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2504; AVX512DQ-NEXT: vzeroupper 2505; AVX512DQ-NEXT: retq 2506 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 2507 %2 = trunc <16 x i16> %1 to <16 x i8> 2508 ret <16 x i8> %2 2509} 2510 2511; 2512; and 2513; 2514 2515define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2516; SSE-LABEL: trunc_and_v4i64_v4i32: 2517; SSE: # %bb.0: 2518; SSE-NEXT: andps %xmm3, %xmm1 2519; SSE-NEXT: andps %xmm2, %xmm0 2520; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2521; SSE-NEXT: retq 2522; 2523; AVX1-LABEL: trunc_and_v4i64_v4i32: 2524; AVX1: # %bb.0: 2525; AVX1-NEXT: vandps %ymm1, %ymm0, 
%ymm0 2526; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2527; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2528; AVX1-NEXT: vzeroupper 2529; AVX1-NEXT: retq 2530; 2531; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: 2532; AVX2-SLOW: # %bb.0: 2533; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 2534; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2535; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2536; AVX2-SLOW-NEXT: vzeroupper 2537; AVX2-SLOW-NEXT: retq 2538; 2539; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: 2540; AVX2-FAST-ALL: # %bb.0: 2541; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 2542; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] 2543; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 2544; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2545; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2546; AVX2-FAST-ALL-NEXT: vzeroupper 2547; AVX2-FAST-ALL-NEXT: retq 2548; 2549; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32: 2550; AVX2-FAST-PERLANE: # %bb.0: 2551; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0 2552; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2553; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2554; AVX2-FAST-PERLANE-NEXT: vzeroupper 2555; AVX2-FAST-PERLANE-NEXT: retq 2556; 2557; AVX512-LABEL: trunc_and_v4i64_v4i32: 2558; AVX512: # %bb.0: 2559; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2560; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2561; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2562; AVX512-NEXT: vzeroupper 2563; AVX512-NEXT: retq 2564 %1 = and <4 x i64> %a0, %a1 2565 %2 = trunc <4 x i64> %1 to <4 x i32> 2566 ret <4 x i32> %2 2567} 2568 2569define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 2570; SSE-LABEL: trunc_and_v8i64_v8i16: 2571; SSE: # %bb.0: 2572; SSE-NEXT: andps %xmm5, %xmm1 2573; SSE-NEXT: andps %xmm4, %xmm0 2574; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2575; SSE-NEXT: andps %xmm7, %xmm3 2576; SSE-NEXT: andps %xmm6, %xmm2 2577; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 2578; SSE-NEXT: pslld $16, %xmm2 2579; SSE-NEXT: psrad $16, %xmm2 2580; SSE-NEXT: pslld $16, %xmm0 2581; SSE-NEXT: psrad $16, %xmm0 2582; SSE-NEXT: packssdw %xmm2, %xmm0 2583; SSE-NEXT: retq 2584; 2585; AVX1-LABEL: trunc_and_v8i64_v8i16: 2586; AVX1: # %bb.0: 2587; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2588; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2589; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 2590; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2591; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2592; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2593; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2594; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2595; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2596; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2597; AVX1-NEXT: vzeroupper 2598; AVX1-NEXT: retq 2599; 2600; AVX2-LABEL: trunc_and_v8i64_v8i16: 2601; AVX2: # %bb.0: 2602; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2603; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2604; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2605; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2606; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2607; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2608; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2609; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2610; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2611; AVX2-NEXT: vzeroupper 2612; AVX2-NEXT: 
retq 2613; 2614; AVX512-LABEL: trunc_and_v8i64_v8i16: 2615; AVX512: # %bb.0: 2616; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 2617; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2618; AVX512-NEXT: vzeroupper 2619; AVX512-NEXT: retq 2620 %1 = and <8 x i64> %a0, %a1 2621 %2 = trunc <8 x i64> %1 to <8 x i16> 2622 ret <8 x i16> %2 2623} 2624 2625define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 2626; SSE-LABEL: trunc_and_v8i32_v8i16: 2627; SSE: # %bb.0: 2628; SSE-NEXT: pand %xmm2, %xmm0 2629; SSE-NEXT: pand %xmm3, %xmm1 2630; SSE-NEXT: pslld $16, %xmm1 2631; SSE-NEXT: psrad $16, %xmm1 2632; SSE-NEXT: pslld $16, %xmm0 2633; SSE-NEXT: psrad $16, %xmm0 2634; SSE-NEXT: packssdw %xmm1, %xmm0 2635; SSE-NEXT: retq 2636; 2637; AVX1-LABEL: trunc_and_v8i32_v8i16: 2638; AVX1: # %bb.0: 2639; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2640; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2641; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2642; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2643; AVX1-NEXT: vzeroupper 2644; AVX1-NEXT: retq 2645; 2646; AVX2-LABEL: trunc_and_v8i32_v8i16: 2647; AVX2: # %bb.0: 2648; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2649; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2650; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2651; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2652; AVX2-NEXT: vzeroupper 2653; AVX2-NEXT: retq 2654; 2655; AVX512-LABEL: trunc_and_v8i32_v8i16: 2656; AVX512: # %bb.0: 2657; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2658; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2659; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2660; AVX512-NEXT: vzeroupper 2661; AVX512-NEXT: retq 2662 %1 = and <8 x i32> %a0, %a1 2663 %2 = trunc <8 x i32> %1 to <8 x i16> 2664 ret <8 x i16> %2 2665} 2666 2667define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 2668; SSE-LABEL: trunc_and_v16i64_v16i8: 2669; SSE: # %bb.0: 2670; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 2671; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 2672; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 2673; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 2674; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 2675; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 2676; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 2677; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 2678; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2679; SSE-NEXT: pand %xmm8, %xmm7 2680; SSE-NEXT: pand %xmm8, %xmm6 2681; SSE-NEXT: packuswb %xmm7, %xmm6 2682; SSE-NEXT: pand %xmm8, %xmm5 2683; SSE-NEXT: pand %xmm8, %xmm4 2684; SSE-NEXT: packuswb %xmm5, %xmm4 2685; SSE-NEXT: packuswb %xmm6, %xmm4 2686; SSE-NEXT: pand %xmm8, %xmm3 2687; SSE-NEXT: pand %xmm8, %xmm2 2688; SSE-NEXT: packuswb %xmm3, %xmm2 2689; SSE-NEXT: pand %xmm8, %xmm1 2690; SSE-NEXT: pand %xmm8, %xmm0 2691; SSE-NEXT: packuswb %xmm1, %xmm0 2692; SSE-NEXT: packuswb %xmm2, %xmm0 2693; SSE-NEXT: packuswb %xmm4, %xmm0 2694; SSE-NEXT: retq 2695; 2696; AVX1-LABEL: trunc_and_v16i64_v16i8: 2697; AVX1: # %bb.0: 2698; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 2699; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 2700; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 2701; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 2702; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] 2703; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2704; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 2705; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 2706; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2707; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 2708; AVX1-NEXT: vpackusdw %xmm5, 
%xmm2, %xmm2 2709; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2710; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 2711; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2712; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2713; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 2714; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2715; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2716; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2717; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2718; AVX1-NEXT: vzeroupper 2719; AVX1-NEXT: retq 2720; 2721; AVX2-LABEL: trunc_and_v16i64_v16i8: 2722; AVX2: # %bb.0: 2723; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 2724; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 2725; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 2726; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 2727; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2728; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2729; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2730; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2731; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2732; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 2733; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 2734; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2735; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2736; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2737; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2738; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2739; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2740; AVX2-NEXT: vzeroupper 2741; AVX2-NEXT: retq 2742; 2743; AVX512-LABEL: trunc_and_v16i64_v16i8: 2744; AVX512: # %bb.0: 2745; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 2746; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 2747; AVX512-NEXT: vpmovqb %zmm1, %xmm1 2748; AVX512-NEXT: vpmovqb %zmm0, %xmm0 2749; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2750; AVX512-NEXT: vzeroupper 2751; AVX512-NEXT: retq 2752 %1 = and <16 x i64> %a0, %a1 2753 %2 = trunc <16 x i64> %1 to <16 x i8> 2754 ret <16 x i8> %2 2755} 2756 2757define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 2758; SSE-LABEL: trunc_and_v16i32_v16i8: 2759; SSE: # %bb.0: 2760; SSE-NEXT: pand %xmm4, %xmm0 2761; SSE-NEXT: pand %xmm5, %xmm1 2762; SSE-NEXT: pand %xmm6, %xmm2 2763; SSE-NEXT: pand %xmm7, %xmm3 2764; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2765; SSE-NEXT: pand %xmm4, %xmm3 2766; SSE-NEXT: pand %xmm4, %xmm2 2767; SSE-NEXT: packuswb %xmm3, %xmm2 2768; SSE-NEXT: pand %xmm4, %xmm1 2769; SSE-NEXT: pand %xmm4, %xmm0 2770; SSE-NEXT: packuswb %xmm1, %xmm0 2771; SSE-NEXT: packuswb %xmm2, %xmm0 2772; SSE-NEXT: retq 2773; 2774; AVX1-LABEL: trunc_and_v16i32_v16i8: 2775; AVX1: # %bb.0: 2776; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2777; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2778; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2779; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2780; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2781; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2782; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2783; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2784; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2785; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2786; AVX1-NEXT: vzeroupper 2787; AVX1-NEXT: retq 2788; 2789; AVX2-LABEL: trunc_and_v16i32_v16i8: 2790; AVX2: # %bb.0: 2791; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2792; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2793; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2794; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2795; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2796; 
AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2797; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2798; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2799; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2800; AVX2-NEXT: vzeroupper 2801; AVX2-NEXT: retq 2802; 2803; AVX512-LABEL: trunc_and_v16i32_v16i8: 2804; AVX512: # %bb.0: 2805; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 2806; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2807; AVX512-NEXT: vzeroupper 2808; AVX512-NEXT: retq 2809 %1 = and <16 x i32> %a0, %a1 2810 %2 = trunc <16 x i32> %1 to <16 x i8> 2811 ret <16 x i8> %2 2812} 2813 2814define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2815; SSE-LABEL: trunc_and_v16i16_v16i8: 2816; SSE: # %bb.0: 2817; SSE-NEXT: pand %xmm2, %xmm0 2818; SSE-NEXT: pand %xmm3, %xmm1 2819; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2820; SSE-NEXT: pand %xmm2, %xmm1 2821; SSE-NEXT: pand %xmm2, %xmm0 2822; SSE-NEXT: packuswb %xmm1, %xmm0 2823; SSE-NEXT: retq 2824; 2825; AVX1-LABEL: trunc_and_v16i16_v16i8: 2826; AVX1: # %bb.0: 2827; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2828; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2829; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2830; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2831; AVX1-NEXT: vzeroupper 2832; AVX1-NEXT: retq 2833; 2834; AVX2-LABEL: trunc_and_v16i16_v16i8: 2835; AVX2: # %bb.0: 2836; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2837; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2838; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2839; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2840; AVX2-NEXT: vzeroupper 2841; AVX2-NEXT: retq 2842; 2843; AVX512F-LABEL: trunc_and_v16i16_v16i8: 2844; AVX512F: # %bb.0: 2845; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 2846; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2847; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2848; AVX512F-NEXT: vzeroupper 2849; AVX512F-NEXT: retq 2850; 2851; AVX512BW-LABEL: trunc_and_v16i16_v16i8: 2852; AVX512BW: # %bb.0: 2853; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 2854; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2855; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2856; AVX512BW-NEXT: vzeroupper 2857; AVX512BW-NEXT: retq 2858; 2859; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: 2860; AVX512DQ: # %bb.0: 2861; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 2862; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2863; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2864; AVX512DQ-NEXT: vzeroupper 2865; AVX512DQ-NEXT: retq 2866 %1 = and <16 x i16> %a0, %a1 2867 %2 = trunc <16 x i16> %1 to <16 x i8> 2868 ret <16 x i8> %2 2869} 2870 2871; 2872; and to constant 2873; 2874 2875define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2876; SSE-LABEL: trunc_and_const_v4i64_v4i32: 2877; SSE: # %bb.0: 2878; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2879; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2880; SSE-NEXT: retq 2881; 2882; AVX1-LABEL: trunc_and_const_v4i64_v4i32: 2883; AVX1: # %bb.0: 2884; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2885; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2886; AVX1-NEXT: vandps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2887; AVX1-NEXT: vzeroupper 2888; AVX1-NEXT: retq 2889; 2890; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: 2891; AVX2-SLOW: # %bb.0: 2892; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2893; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2894; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2895; AVX2-SLOW-NEXT: vzeroupper 2896; AVX2-SLOW-NEXT: retq 2897; 2898; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32: 2899; AVX2-FAST-ALL: # %bb.0: 2900; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] 2901; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 2902; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2903; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2904; AVX2-FAST-ALL-NEXT: vzeroupper 2905; AVX2-FAST-ALL-NEXT: retq 2906; 2907; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32: 2908; AVX2-FAST-PERLANE: # %bb.0: 2909; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2910; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2911; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2912; AVX2-FAST-PERLANE-NEXT: vzeroupper 2913; AVX2-FAST-PERLANE-NEXT: retq 2914; 2915; AVX512-LABEL: trunc_and_const_v4i64_v4i32: 2916; AVX512: # %bb.0: 2917; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2918; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2919; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2920; AVX512-NEXT: vzeroupper 2921; AVX512-NEXT: retq 2922 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2923 %2 = trunc <4 x i64> %1 to <4 x i32> 2924 ret <4 x i32> %2 2925} 2926 2927define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2928; SSE-LABEL: trunc_and_const_v8i64_v8i16: 2929; SSE: # %bb.0: 2930; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 2931; SSE-NEXT: pslld $16, %xmm2 2932; SSE-NEXT: psrad $16, %xmm2 2933; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2934; SSE-NEXT: pslld $16, %xmm0 2935; SSE-NEXT: psrad $16, %xmm0 2936; SSE-NEXT: packssdw %xmm2, %xmm0 2937; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2938; SSE-NEXT: retq 2939; 2940; AVX1-LABEL: trunc_and_const_v8i64_v8i16: 2941; AVX1: # %bb.0: 2942; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 2943; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2944; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2945; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2946; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2947; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2948; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2949; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2950; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2951; AVX1-NEXT: vzeroupper 2952; AVX1-NEXT: retq 2953; 2954; AVX2-LABEL: trunc_and_const_v8i64_v8i16: 2955; AVX2: # %bb.0: 2956; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2957; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2958; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2959; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2960; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2961; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2962; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2963; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2964; AVX2-NEXT: vzeroupper 2965; AVX2-NEXT: retq 2966; 2967; AVX512-LABEL: trunc_and_const_v8i64_v8i16: 2968; AVX512: # %bb.0: 2969; AVX512-NEXT: 
vpmovqw %zmm0, %xmm0 2970; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2971; AVX512-NEXT: vzeroupper 2972; AVX512-NEXT: retq 2973 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2974 %2 = trunc <8 x i64> %1 to <8 x i16> 2975 ret <8 x i16> %2 2976} 2977 2978define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 2979; SSE-LABEL: trunc_and_const_v8i32_v8i16: 2980; SSE: # %bb.0: 2981; SSE-NEXT: pslld $16, %xmm1 2982; SSE-NEXT: psrad $16, %xmm1 2983; SSE-NEXT: pslld $16, %xmm0 2984; SSE-NEXT: psrad $16, %xmm0 2985; SSE-NEXT: packssdw %xmm1, %xmm0 2986; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2987; SSE-NEXT: retq 2988; 2989; AVX1-LABEL: trunc_and_const_v8i32_v8i16: 2990; AVX1: # %bb.0: 2991; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2992; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2993; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2994; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2995; AVX1-NEXT: vzeroupper 2996; AVX1-NEXT: retq 2997; 2998; AVX2-LABEL: trunc_and_const_v8i32_v8i16: 2999; AVX2: # %bb.0: 3000; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3001; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3002; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3003; AVX2-NEXT: vzeroupper 3004; AVX2-NEXT: retq 3005; 3006; AVX512-LABEL: trunc_and_const_v8i32_v8i16: 3007; AVX512: # %bb.0: 3008; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3009; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3010; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3011; AVX512-NEXT: vzeroupper 3012; AVX512-NEXT: retq 3013 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3014 %2 = trunc <8 x i32> %1 to <8 x i16> 3015 ret <8 x i16> %2 3016} 3017 3018define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3019; SSE-LABEL: trunc_and_const_v16i64_v16i8: 3020; SSE: # %bb.0: 3021; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3022; SSE-NEXT: pand %xmm8, %xmm7 3023; SSE-NEXT: pand %xmm8, %xmm6 3024; SSE-NEXT: packuswb %xmm7, %xmm6 3025; SSE-NEXT: pand %xmm8, %xmm5 3026; SSE-NEXT: pand %xmm8, %xmm4 3027; SSE-NEXT: packuswb %xmm5, %xmm4 3028; SSE-NEXT: packuswb %xmm6, %xmm4 3029; SSE-NEXT: pand %xmm8, %xmm3 3030; SSE-NEXT: pand %xmm8, %xmm2 3031; SSE-NEXT: packuswb %xmm3, %xmm2 3032; SSE-NEXT: pand %xmm8, %xmm1 3033; SSE-NEXT: pand %xmm8, %xmm0 3034; SSE-NEXT: packuswb %xmm1, %xmm0 3035; SSE-NEXT: packuswb %xmm2, %xmm0 3036; SSE-NEXT: packuswb %xmm4, %xmm0 3037; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3038; SSE-NEXT: retq 3039; 3040; AVX1-LABEL: trunc_and_const_v16i64_v16i8: 3041; AVX1: # %bb.0: 3042; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] 3043; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3044; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3045; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3046; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3047; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3048; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3049; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3050; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3051; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3052; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3053; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3054; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3055; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3056; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3057; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 
3058; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3059; AVX1-NEXT: vzeroupper 3060; AVX1-NEXT: retq 3061; 3062; AVX2-LABEL: trunc_and_const_v16i64_v16i8: 3063; AVX2: # %bb.0: 3064; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3065; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3066; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3067; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3068; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3069; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3070; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3071; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3072; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3073; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3074; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3075; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3076; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3077; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3078; AVX2-NEXT: vzeroupper 3079; AVX2-NEXT: retq 3080; 3081; AVX512-LABEL: trunc_and_const_v16i64_v16i8: 3082; AVX512: # %bb.0: 3083; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3084; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3085; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3086; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3087; AVX512-NEXT: vzeroupper 3088; AVX512-NEXT: retq 3089 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3090 %2 = trunc <16 x i64> %1 to <16 x i8> 3091 ret <16 x i8> %2 3092} 3093 3094define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3095; SSE-LABEL: trunc_and_const_v16i32_v16i8: 3096; SSE: # %bb.0: 3097; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3098; SSE-NEXT: pand %xmm4, %xmm3 3099; SSE-NEXT: pand %xmm4, %xmm2 3100; SSE-NEXT: packuswb %xmm3, %xmm2 3101; SSE-NEXT: pand %xmm4, %xmm1 3102; SSE-NEXT: pand %xmm4, %xmm0 3103; SSE-NEXT: packuswb %xmm1, %xmm0 3104; SSE-NEXT: packuswb %xmm2, %xmm0 3105; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3106; SSE-NEXT: retq 3107; 3108; AVX1-LABEL: trunc_and_const_v16i32_v16i8: 3109; AVX1: # %bb.0: 3110; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3111; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3112; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3113; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3114; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3115; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3116; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3117; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3118; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3119; AVX1-NEXT: vzeroupper 3120; AVX1-NEXT: retq 3121; 3122; AVX2-LABEL: trunc_and_const_v16i32_v16i8: 3123; AVX2: # %bb.0: 3124; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3125; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3126; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3127; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3128; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3129; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3130; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3131; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3132; AVX2-NEXT: vzeroupper 3133; AVX2-NEXT: retq 3134; 3135; AVX512-LABEL: trunc_and_const_v16i32_v16i8: 3136; AVX512: # %bb.0: 3137; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3138; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3139; AVX512-NEXT: vzeroupper 3140; 
AVX512-NEXT: retq 3141 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3142 %2 = trunc <16 x i32> %1 to <16 x i8> 3143 ret <16 x i8> %2 3144} 3145 3146define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3147; SSE-LABEL: trunc_and_const_v16i16_v16i8: 3148; SSE: # %bb.0: 3149; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3150; SSE-NEXT: pand %xmm2, %xmm1 3151; SSE-NEXT: pand %xmm2, %xmm0 3152; SSE-NEXT: packuswb %xmm1, %xmm0 3153; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3154; SSE-NEXT: retq 3155; 3156; AVX1-LABEL: trunc_and_const_v16i16_v16i8: 3157; AVX1: # %bb.0: 3158; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3159; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3160; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3161; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3162; AVX1-NEXT: vzeroupper 3163; AVX1-NEXT: retq 3164; 3165; AVX2-LABEL: trunc_and_const_v16i16_v16i8: 3166; AVX2: # %bb.0: 3167; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3168; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3169; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3170; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3171; AVX2-NEXT: vzeroupper 3172; AVX2-NEXT: retq 3173; 3174; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: 3175; AVX512F: # %bb.0: 3176; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3177; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3178; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3179; AVX512F-NEXT: vzeroupper 3180; AVX512F-NEXT: retq 3181; 3182; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: 3183; AVX512BW: # %bb.0: 3184; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3185; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3186; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3187; AVX512BW-NEXT: vzeroupper 3188; AVX512BW-NEXT: retq 3189; 3190; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: 3191; AVX512DQ: # %bb.0: 3192; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3193; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3194; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3195; AVX512DQ-NEXT: vzeroupper 3196; AVX512DQ-NEXT: retq 3197 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3198 %2 = trunc <16 x i16> %1 to <16 x i8> 3199 ret <16 x i8> %2 3200} 3201 3202; 3203; xor 3204; 3205 3206define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3207; SSE-LABEL: trunc_xor_v4i64_v4i32: 3208; SSE: # %bb.0: 3209; SSE-NEXT: xorps %xmm3, %xmm1 3210; SSE-NEXT: xorps %xmm2, %xmm0 3211; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3212; SSE-NEXT: retq 3213; 3214; AVX1-LABEL: trunc_xor_v4i64_v4i32: 3215; AVX1: # %bb.0: 3216; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3217; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3218; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3219; AVX1-NEXT: vzeroupper 3220; AVX1-NEXT: retq 3221; 3222; 
AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: 3223; AVX2-SLOW: # %bb.0: 3224; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 3225; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3226; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3227; AVX2-SLOW-NEXT: vzeroupper 3228; AVX2-SLOW-NEXT: retq 3229; 3230; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32: 3231; AVX2-FAST-ALL: # %bb.0: 3232; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 3233; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] 3234; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 3235; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3236; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3237; AVX2-FAST-ALL-NEXT: vzeroupper 3238; AVX2-FAST-ALL-NEXT: retq 3239; 3240; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32: 3241; AVX2-FAST-PERLANE: # %bb.0: 3242; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0 3243; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3244; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3245; AVX2-FAST-PERLANE-NEXT: vzeroupper 3246; AVX2-FAST-PERLANE-NEXT: retq 3247; 3248; AVX512-LABEL: trunc_xor_v4i64_v4i32: 3249; AVX512: # %bb.0: 3250; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3251; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3252; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3253; AVX512-NEXT: vzeroupper 3254; AVX512-NEXT: retq 3255 %1 = xor <4 x i64> %a0, %a1 3256 %2 = trunc <4 x i64> %1 to <4 x i32> 3257 ret <4 x i32> %2 3258} 3259 3260define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3261; SSE-LABEL: trunc_xor_v8i64_v8i16: 3262; SSE: # %bb.0: 3263; SSE-NEXT: xorps %xmm5, %xmm1 3264; SSE-NEXT: xorps %xmm4, %xmm0 3265; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3266; SSE-NEXT: xorps %xmm7, %xmm3 3267; SSE-NEXT: xorps %xmm6, %xmm2 3268; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 3269; SSE-NEXT: pslld $16, %xmm2 3270; SSE-NEXT: psrad $16, %xmm2 3271; SSE-NEXT: pslld $16, %xmm0 3272; SSE-NEXT: psrad $16, %xmm0 3273; SSE-NEXT: packssdw %xmm2, %xmm0 3274; SSE-NEXT: retq 3275; 3276; AVX1-LABEL: trunc_xor_v8i64_v8i16: 3277; AVX1: # %bb.0: 3278; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3279; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3280; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 3281; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3282; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3283; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3284; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3285; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3286; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3287; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3288; AVX1-NEXT: vzeroupper 3289; AVX1-NEXT: retq 3290; 3291; AVX2-LABEL: trunc_xor_v8i64_v8i16: 3292; AVX2: # %bb.0: 3293; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3294; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3295; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3296; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3297; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3298; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3299; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3300; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3301; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3302; AVX2-NEXT: vzeroupper 3303; AVX2-NEXT: retq 3304; 3305; AVX512-LABEL: trunc_xor_v8i64_v8i16: 3306; AVX512: # %bb.0: 3307; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 3308; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3309; 
AVX512-NEXT: vzeroupper 3310; AVX512-NEXT: retq 3311 %1 = xor <8 x i64> %a0, %a1 3312 %2 = trunc <8 x i64> %1 to <8 x i16> 3313 ret <8 x i16> %2 3314} 3315 3316define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 3317; SSE-LABEL: trunc_xor_v8i32_v8i16: 3318; SSE: # %bb.0: 3319; SSE-NEXT: pxor %xmm2, %xmm0 3320; SSE-NEXT: pxor %xmm3, %xmm1 3321; SSE-NEXT: pslld $16, %xmm1 3322; SSE-NEXT: psrad $16, %xmm1 3323; SSE-NEXT: pslld $16, %xmm0 3324; SSE-NEXT: psrad $16, %xmm0 3325; SSE-NEXT: packssdw %xmm1, %xmm0 3326; SSE-NEXT: retq 3327; 3328; AVX1-LABEL: trunc_xor_v8i32_v8i16: 3329; AVX1: # %bb.0: 3330; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3331; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3332; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3333; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3334; AVX1-NEXT: vzeroupper 3335; AVX1-NEXT: retq 3336; 3337; AVX2-LABEL: trunc_xor_v8i32_v8i16: 3338; AVX2: # %bb.0: 3339; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3340; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3341; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3342; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3343; AVX2-NEXT: vzeroupper 3344; AVX2-NEXT: retq 3345; 3346; AVX512-LABEL: trunc_xor_v8i32_v8i16: 3347; AVX512: # %bb.0: 3348; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3349; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3350; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3351; AVX512-NEXT: vzeroupper 3352; AVX512-NEXT: retq 3353 %1 = xor <8 x i32> %a0, %a1 3354 %2 = trunc <8 x i32> %1 to <8 x i16> 3355 ret <8 x i16> %2 3356} 3357 3358define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3359; SSE-LABEL: trunc_xor_v16i64_v16i8: 3360; SSE: # %bb.0: 3361; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 3362; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 3363; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 3364; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 3365; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 3366; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 3367; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 3368; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 3369; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3370; SSE-NEXT: pand %xmm8, %xmm7 3371; SSE-NEXT: pand %xmm8, %xmm6 3372; SSE-NEXT: packuswb %xmm7, %xmm6 3373; SSE-NEXT: pand %xmm8, %xmm5 3374; SSE-NEXT: pand %xmm8, %xmm4 3375; SSE-NEXT: packuswb %xmm5, %xmm4 3376; SSE-NEXT: packuswb %xmm6, %xmm4 3377; SSE-NEXT: pand %xmm8, %xmm3 3378; SSE-NEXT: pand %xmm8, %xmm2 3379; SSE-NEXT: packuswb %xmm3, %xmm2 3380; SSE-NEXT: pand %xmm8, %xmm1 3381; SSE-NEXT: pand %xmm8, %xmm0 3382; SSE-NEXT: packuswb %xmm1, %xmm0 3383; SSE-NEXT: packuswb %xmm2, %xmm0 3384; SSE-NEXT: packuswb %xmm4, %xmm0 3385; SSE-NEXT: retq 3386; 3387; AVX1-LABEL: trunc_xor_v16i64_v16i8: 3388; AVX1: # %bb.0: 3389; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 3390; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 3391; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 3392; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 3393; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] 3394; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3395; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3396; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3397; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3398; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3399; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3400; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3401; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3402; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3403; AVX1-NEXT: 
vpackusdw %xmm3, %xmm1, %xmm1 3404; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3405; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3406; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3407; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3408; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3409; AVX1-NEXT: vzeroupper 3410; AVX1-NEXT: retq 3411; 3412; AVX2-LABEL: trunc_xor_v16i64_v16i8: 3413; AVX2: # %bb.0: 3414; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 3415; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 3416; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 3417; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3 3418; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3419; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3420; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3421; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3422; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3423; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3424; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3425; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3426; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3427; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3428; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3429; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3430; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3431; AVX2-NEXT: vzeroupper 3432; AVX2-NEXT: retq 3433; 3434; AVX512-LABEL: trunc_xor_v16i64_v16i8: 3435; AVX512: # %bb.0: 3436; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 3437; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 3438; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3439; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3440; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3441; AVX512-NEXT: vzeroupper 3442; AVX512-NEXT: retq 3443 %1 = xor <16 x i64> %a0, %a1 3444 %2 = trunc <16 x i64> %1 to <16 x i8> 3445 ret <16 x i8> %2 3446} 3447 3448define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3449; SSE-LABEL: trunc_xor_v16i32_v16i8: 3450; SSE: # %bb.0: 3451; SSE-NEXT: pxor %xmm4, %xmm0 3452; SSE-NEXT: pxor %xmm5, %xmm1 3453; SSE-NEXT: pxor %xmm6, %xmm2 3454; SSE-NEXT: pxor %xmm7, %xmm3 3455; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3456; SSE-NEXT: pand %xmm4, %xmm3 3457; SSE-NEXT: pand %xmm4, %xmm2 3458; SSE-NEXT: packuswb %xmm3, %xmm2 3459; SSE-NEXT: pand %xmm4, %xmm1 3460; SSE-NEXT: pand %xmm4, %xmm0 3461; SSE-NEXT: packuswb %xmm1, %xmm0 3462; SSE-NEXT: packuswb %xmm2, %xmm0 3463; SSE-NEXT: retq 3464; 3465; AVX1-LABEL: trunc_xor_v16i32_v16i8: 3466; AVX1: # %bb.0: 3467; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3468; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3469; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3470; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3471; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3472; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3473; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3474; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3475; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3476; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3477; AVX1-NEXT: vzeroupper 3478; AVX1-NEXT: retq 3479; 3480; AVX2-LABEL: trunc_xor_v16i32_v16i8: 3481; AVX2: # %bb.0: 3482; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3483; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3484; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3485; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3486; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3487; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3488; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3489; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3490; AVX2-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[0,2,1,3] 3491; AVX2-NEXT: vzeroupper 3492; AVX2-NEXT: retq 3493; 3494; AVX512-LABEL: trunc_xor_v16i32_v16i8: 3495; AVX512: # %bb.0: 3496; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 3497; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3498; AVX512-NEXT: vzeroupper 3499; AVX512-NEXT: retq 3500 %1 = xor <16 x i32> %a0, %a1 3501 %2 = trunc <16 x i32> %1 to <16 x i8> 3502 ret <16 x i8> %2 3503} 3504 3505define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3506; SSE-LABEL: trunc_xor_v16i16_v16i8: 3507; SSE: # %bb.0: 3508; SSE-NEXT: pxor %xmm2, %xmm0 3509; SSE-NEXT: pxor %xmm3, %xmm1 3510; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3511; SSE-NEXT: pand %xmm2, %xmm1 3512; SSE-NEXT: pand %xmm2, %xmm0 3513; SSE-NEXT: packuswb %xmm1, %xmm0 3514; SSE-NEXT: retq 3515; 3516; AVX1-LABEL: trunc_xor_v16i16_v16i8: 3517; AVX1: # %bb.0: 3518; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3519; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3520; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3521; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3522; AVX1-NEXT: vzeroupper 3523; AVX1-NEXT: retq 3524; 3525; AVX2-LABEL: trunc_xor_v16i16_v16i8: 3526; AVX2: # %bb.0: 3527; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3528; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3529; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3530; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3531; AVX2-NEXT: vzeroupper 3532; AVX2-NEXT: retq 3533; 3534; AVX512F-LABEL: trunc_xor_v16i16_v16i8: 3535; AVX512F: # %bb.0: 3536; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 3537; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3538; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3539; AVX512F-NEXT: vzeroupper 3540; AVX512F-NEXT: retq 3541; 3542; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: 3543; AVX512BW: # %bb.0: 3544; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 3545; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3546; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3547; AVX512BW-NEXT: vzeroupper 3548; AVX512BW-NEXT: retq 3549; 3550; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: 3551; AVX512DQ: # %bb.0: 3552; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 3553; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3554; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3555; AVX512DQ-NEXT: vzeroupper 3556; AVX512DQ-NEXT: retq 3557 %1 = xor <16 x i16> %a0, %a1 3558 %2 = trunc <16 x i16> %1 to <16 x i8> 3559 ret <16 x i8> %2 3560} 3561 3562; 3563; xor to constant 3564; 3565 3566define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 3567; SSE-LABEL: trunc_xor_const_v4i64_v4i32: 3568; SSE: # %bb.0: 3569; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3570; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3571; SSE-NEXT: retq 3572; 3573; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: 3574; AVX1: # %bb.0: 3575; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3576; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3577; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3578; AVX1-NEXT: vzeroupper 3579; AVX1-NEXT: retq 3580; 3581; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: 3582; AVX2-SLOW: # %bb.0: 
3583; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3584; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3585; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3586; AVX2-SLOW-NEXT: vzeroupper 3587; AVX2-SLOW-NEXT: retq 3588; 3589; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32: 3590; AVX2-FAST-ALL: # %bb.0: 3591; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] 3592; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 3593; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3594; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3595; AVX2-FAST-ALL-NEXT: vzeroupper 3596; AVX2-FAST-ALL-NEXT: retq 3597; 3598; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32: 3599; AVX2-FAST-PERLANE: # %bb.0: 3600; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3601; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3602; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3603; AVX2-FAST-PERLANE-NEXT: vzeroupper 3604; AVX2-FAST-PERLANE-NEXT: retq 3605; 3606; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: 3607; AVX512: # %bb.0: 3608; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3609; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3610; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3611; AVX512-NEXT: vzeroupper 3612; AVX512-NEXT: retq 3613 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 3614 %2 = trunc <4 x i64> %1 to <4 x i32> 3615 ret <4 x i32> %2 3616} 3617 3618define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 3619; SSE-LABEL: trunc_xor_const_v8i64_v8i16: 3620; SSE: # %bb.0: 3621; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 3622; SSE-NEXT: pslld $16, %xmm2 3623; SSE-NEXT: psrad $16, %xmm2 3624; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3625; SSE-NEXT: pslld $16, %xmm0 3626; SSE-NEXT: psrad $16, %xmm0 3627; SSE-NEXT: packssdw %xmm2, %xmm0 3628; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3629; SSE-NEXT: retq 3630; 3631; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: 3632; AVX1: # %bb.0: 3633; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 3634; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3635; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3636; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3637; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3638; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3639; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3640; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3641; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3642; AVX1-NEXT: vzeroupper 3643; AVX1-NEXT: retq 3644; 3645; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: 3646; AVX2: # %bb.0: 3647; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3648; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3649; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3650; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3651; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3652; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3653; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3654; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3655; AVX2-NEXT: vzeroupper 3656; AVX2-NEXT: retq 3657; 3658; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: 3659; AVX512: # %bb.0: 3660; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3661; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3662; AVX512-NEXT: vzeroupper 3663; AVX512-NEXT: retq 3664 %1 = xor <8 x i64> %a0, <i64 
0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3665 %2 = trunc <8 x i64> %1 to <8 x i16> 3666 ret <8 x i16> %2 3667} 3668 3669define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3670; SSE-LABEL: trunc_xor_const_v8i32_v8i16: 3671; SSE: # %bb.0: 3672; SSE-NEXT: pslld $16, %xmm1 3673; SSE-NEXT: psrad $16, %xmm1 3674; SSE-NEXT: pslld $16, %xmm0 3675; SSE-NEXT: psrad $16, %xmm0 3676; SSE-NEXT: packssdw %xmm1, %xmm0 3677; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3678; SSE-NEXT: retq 3679; 3680; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: 3681; AVX1: # %bb.0: 3682; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3683; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3684; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3685; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3686; AVX1-NEXT: vzeroupper 3687; AVX1-NEXT: retq 3688; 3689; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: 3690; AVX2: # %bb.0: 3691; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3692; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3693; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3694; AVX2-NEXT: vzeroupper 3695; AVX2-NEXT: retq 3696; 3697; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: 3698; AVX512: # %bb.0: 3699; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3700; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3701; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3702; AVX512-NEXT: vzeroupper 3703; AVX512-NEXT: retq 3704 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3705 %2 = trunc <8 x i32> %1 to <8 x i16> 3706 ret <8 x i16> %2 3707} 3708 3709define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3710; SSE-LABEL: trunc_xor_const_v16i64_v16i8: 3711; SSE: # %bb.0: 3712; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3713; SSE-NEXT: pand %xmm8, %xmm7 3714; SSE-NEXT: pand %xmm8, %xmm6 3715; SSE-NEXT: packuswb %xmm7, %xmm6 3716; SSE-NEXT: pand %xmm8, %xmm5 3717; SSE-NEXT: pand %xmm8, %xmm4 3718; SSE-NEXT: packuswb %xmm5, %xmm4 3719; SSE-NEXT: packuswb %xmm6, %xmm4 3720; SSE-NEXT: pand %xmm8, %xmm3 3721; SSE-NEXT: pand %xmm8, %xmm2 3722; SSE-NEXT: packuswb %xmm3, %xmm2 3723; SSE-NEXT: pand %xmm8, %xmm1 3724; SSE-NEXT: pand %xmm8, %xmm0 3725; SSE-NEXT: packuswb %xmm1, %xmm0 3726; SSE-NEXT: packuswb %xmm2, %xmm0 3727; SSE-NEXT: packuswb %xmm4, %xmm0 3728; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3729; SSE-NEXT: retq 3730; 3731; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: 3732; AVX1: # %bb.0: 3733; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] 3734; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3735; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3736; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3737; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3738; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3739; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3740; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3741; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3742; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3743; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3744; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3745; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3746; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3747; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3748; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3749; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3750; AVX1-NEXT: vzeroupper 3751; AVX1-NEXT: retq 3752; 3753; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: 3754; 
AVX2: # %bb.0: 3755; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3756; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3757; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3758; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3759; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3760; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3761; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3762; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3763; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3764; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3765; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3766; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3767; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3768; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3769; AVX2-NEXT: vzeroupper 3770; AVX2-NEXT: retq 3771; 3772; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: 3773; AVX512: # %bb.0: 3774; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3775; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3776; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3777; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3778; AVX512-NEXT: vzeroupper 3779; AVX512-NEXT: retq 3780 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3781 %2 = trunc <16 x i64> %1 to <16 x i8> 3782 ret <16 x i8> %2 3783} 3784 3785define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3786; SSE-LABEL: trunc_xor_const_v16i32_v16i8: 3787; SSE: # %bb.0: 3788; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3789; SSE-NEXT: pand %xmm4, %xmm3 3790; SSE-NEXT: pand %xmm4, %xmm2 3791; SSE-NEXT: packuswb %xmm3, %xmm2 3792; SSE-NEXT: pand %xmm4, %xmm1 3793; SSE-NEXT: pand %xmm4, %xmm0 3794; SSE-NEXT: packuswb %xmm1, %xmm0 3795; SSE-NEXT: packuswb %xmm2, %xmm0 3796; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3797; SSE-NEXT: retq 3798; 3799; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: 3800; AVX1: # %bb.0: 3801; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3802; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3803; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3804; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3805; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3806; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3807; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3808; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3809; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3810; AVX1-NEXT: vzeroupper 3811; AVX1-NEXT: retq 3812; 3813; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: 3814; AVX2: # %bb.0: 3815; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3816; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3817; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3818; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3819; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3820; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3821; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3822; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3823; AVX2-NEXT: vzeroupper 3824; AVX2-NEXT: retq 3825; 3826; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: 3827; AVX512: # %bb.0: 3828; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3829; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3830; AVX512-NEXT: vzeroupper 3831; AVX512-NEXT: retq 3832 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3833 %2 = 
trunc <16 x i32> %1 to <16 x i8> 3834 ret <16 x i8> %2 3835} 3836 3837define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3838; SSE-LABEL: trunc_xor_const_v16i16_v16i8: 3839; SSE: # %bb.0: 3840; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3841; SSE-NEXT: pand %xmm2, %xmm1 3842; SSE-NEXT: pand %xmm2, %xmm0 3843; SSE-NEXT: packuswb %xmm1, %xmm0 3844; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3845; SSE-NEXT: retq 3846; 3847; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: 3848; AVX1: # %bb.0: 3849; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3850; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3851; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3852; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3853; AVX1-NEXT: vzeroupper 3854; AVX1-NEXT: retq 3855; 3856; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: 3857; AVX2: # %bb.0: 3858; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3859; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3860; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3861; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3862; AVX2-NEXT: vzeroupper 3863; AVX2-NEXT: retq 3864; 3865; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: 3866; AVX512F: # %bb.0: 3867; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3868; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3869; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3870; AVX512F-NEXT: vzeroupper 3871; AVX512F-NEXT: retq 3872; 3873; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: 3874; AVX512BW: # %bb.0: 3875; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3876; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3877; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3878; AVX512BW-NEXT: vzeroupper 3879; AVX512BW-NEXT: retq 3880; 3881; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: 3882; AVX512DQ: # %bb.0: 3883; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3884; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3885; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3886; AVX512DQ-NEXT: vzeroupper 3887; AVX512DQ-NEXT: retq 3888 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3889 %2 = trunc <16 x i16> %1 to <16 x i8> 3890 ret <16 x i8> %2 3891} 3892 3893; 3894; or 3895; 3896 3897define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3898; SSE-LABEL: trunc_or_v4i64_v4i32: 3899; SSE: # %bb.0: 3900; SSE-NEXT: orps %xmm3, %xmm1 3901; SSE-NEXT: orps %xmm2, %xmm0 3902; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3903; SSE-NEXT: retq 3904; 3905; AVX1-LABEL: trunc_or_v4i64_v4i32: 3906; AVX1: # %bb.0: 3907; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 3908; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3909; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3910; AVX1-NEXT: vzeroupper 3911; AVX1-NEXT: retq 3912; 3913; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: 3914; AVX2-SLOW: # %bb.0: 3915; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 3916; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3917; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3918; AVX2-SLOW-NEXT: vzeroupper 3919; AVX2-SLOW-NEXT: retq 3920; 3921; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32: 3922; AVX2-FAST-ALL: # %bb.0: 3923; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 3924; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] 3925; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 3926; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3927; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3928; AVX2-FAST-ALL-NEXT: vzeroupper 3929; AVX2-FAST-ALL-NEXT: retq 3930; 3931; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32: 3932; AVX2-FAST-PERLANE: # %bb.0: 3933; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0 3934; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3935; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3936; AVX2-FAST-PERLANE-NEXT: vzeroupper 3937; AVX2-FAST-PERLANE-NEXT: retq 3938; 3939; AVX512-LABEL: trunc_or_v4i64_v4i32: 3940; AVX512: # %bb.0: 3941; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 3942; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3943; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3944; AVX512-NEXT: vzeroupper 3945; AVX512-NEXT: retq 3946 %1 = or <4 x i64> %a0, %a1 3947 %2 = trunc <4 x i64> %1 to <4 x i32> 3948 ret <4 x i32> %2 3949} 3950 3951define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3952; SSE-LABEL: trunc_or_v8i64_v8i16: 3953; SSE: # %bb.0: 3954; SSE-NEXT: orps %xmm5, %xmm1 3955; SSE-NEXT: orps %xmm4, %xmm0 3956; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3957; SSE-NEXT: orps %xmm7, %xmm3 3958; SSE-NEXT: orps %xmm6, %xmm2 3959; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 3960; SSE-NEXT: pslld $16, %xmm2 3961; SSE-NEXT: psrad $16, %xmm2 3962; SSE-NEXT: pslld $16, %xmm0 3963; SSE-NEXT: psrad $16, %xmm0 3964; SSE-NEXT: packssdw %xmm2, %xmm0 3965; SSE-NEXT: retq 3966; 3967; AVX1-LABEL: trunc_or_v8i64_v8i16: 3968; AVX1: # %bb.0: 3969; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 3970; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 3971; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 3972; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3973; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3974; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3975; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3976; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3977; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3978; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3979; AVX1-NEXT: vzeroupper 3980; AVX1-NEXT: retq 3981; 3982; AVX2-LABEL: trunc_or_v8i64_v8i16: 3983; AVX2: # %bb.0: 3984; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 3985; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 3986; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3987; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3988; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3989; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3990; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3991; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3992; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3993; AVX2-NEXT: vzeroupper 3994; AVX2-NEXT: retq 3995; 3996; AVX512-LABEL: trunc_or_v8i64_v8i16: 3997; AVX512: # %bb.0: 3998; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 3999; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4000; AVX512-NEXT: vzeroupper 4001; AVX512-NEXT: retq 4002 %1 = or <8 x i64> %a0, %a1 4003 %2 = trunc <8 x i64> %1 to <8 x i16> 4004 ret <8 x i16> %2 4005} 4006 4007define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 
x i32> %a1) nounwind { 4008; SSE-LABEL: trunc_or_v8i32_v8i16: 4009; SSE: # %bb.0: 4010; SSE-NEXT: por %xmm2, %xmm0 4011; SSE-NEXT: por %xmm3, %xmm1 4012; SSE-NEXT: pslld $16, %xmm1 4013; SSE-NEXT: psrad $16, %xmm1 4014; SSE-NEXT: pslld $16, %xmm0 4015; SSE-NEXT: psrad $16, %xmm0 4016; SSE-NEXT: packssdw %xmm1, %xmm0 4017; SSE-NEXT: retq 4018; 4019; AVX1-LABEL: trunc_or_v8i32_v8i16: 4020; AVX1: # %bb.0: 4021; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4022; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4023; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4024; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4025; AVX1-NEXT: vzeroupper 4026; AVX1-NEXT: retq 4027; 4028; AVX2-LABEL: trunc_or_v8i32_v8i16: 4029; AVX2: # %bb.0: 4030; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4031; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 4032; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4033; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4034; AVX2-NEXT: vzeroupper 4035; AVX2-NEXT: retq 4036; 4037; AVX512-LABEL: trunc_or_v8i32_v8i16: 4038; AVX512: # %bb.0: 4039; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 4040; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4041; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4042; AVX512-NEXT: vzeroupper 4043; AVX512-NEXT: retq 4044 %1 = or <8 x i32> %a0, %a1 4045 %2 = trunc <8 x i32> %1 to <8 x i16> 4046 ret <8 x i16> %2 4047} 4048 4049define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 4050; SSE-LABEL: trunc_or_v16i64_v16i8: 4051; SSE: # %bb.0: 4052; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 4053; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 4054; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 4055; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 4056; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 4057; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 4058; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 4059; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 4060; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4061; SSE-NEXT: pand %xmm8, %xmm7 4062; SSE-NEXT: pand %xmm8, %xmm6 4063; SSE-NEXT: packuswb %xmm7, %xmm6 4064; SSE-NEXT: pand %xmm8, %xmm5 4065; SSE-NEXT: pand %xmm8, %xmm4 4066; SSE-NEXT: packuswb %xmm5, %xmm4 4067; SSE-NEXT: packuswb %xmm6, %xmm4 4068; SSE-NEXT: pand %xmm8, %xmm3 4069; SSE-NEXT: pand %xmm8, %xmm2 4070; SSE-NEXT: packuswb %xmm3, %xmm2 4071; SSE-NEXT: pand %xmm8, %xmm1 4072; SSE-NEXT: pand %xmm8, %xmm0 4073; SSE-NEXT: packuswb %xmm1, %xmm0 4074; SSE-NEXT: packuswb %xmm2, %xmm0 4075; SSE-NEXT: packuswb %xmm4, %xmm0 4076; SSE-NEXT: retq 4077; 4078; AVX1-LABEL: trunc_or_v16i64_v16i8: 4079; AVX1: # %bb.0: 4080; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 4081; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 4082; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 4083; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 4084; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] 4085; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 4086; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 4087; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 4088; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 4089; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 4090; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 4091; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 4092; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 4093; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4094; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4095; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 4096; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4097; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 4098; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4099; AVX1-NEXT: vpackuswb %xmm2, 
%xmm0, %xmm0 4100; AVX1-NEXT: vzeroupper 4101; AVX1-NEXT: retq 4102; 4103; AVX2-LABEL: trunc_or_v16i64_v16i8: 4104; AVX2: # %bb.0: 4105; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0 4106; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1 4107; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2 4108; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3 4109; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4110; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 4111; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 4112; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 4113; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 4114; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 4115; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 4116; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4117; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 4118; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 4119; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4120; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4121; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4122; AVX2-NEXT: vzeroupper 4123; AVX2-NEXT: retq 4124; 4125; AVX512-LABEL: trunc_or_v16i64_v16i8: 4126; AVX512: # %bb.0: 4127; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 4128; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 4129; AVX512-NEXT: vpmovqb %zmm1, %xmm1 4130; AVX512-NEXT: vpmovqb %zmm0, %xmm0 4131; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4132; AVX512-NEXT: vzeroupper 4133; AVX512-NEXT: retq 4134 %1 = or <16 x i64> %a0, %a1 4135 %2 = trunc <16 x i64> %1 to <16 x i8> 4136 ret <16 x i8> %2 4137} 4138 4139define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 4140; SSE-LABEL: trunc_or_v16i32_v16i8: 4141; SSE: # %bb.0: 4142; SSE-NEXT: por %xmm4, %xmm0 4143; SSE-NEXT: por %xmm5, %xmm1 4144; SSE-NEXT: por %xmm6, %xmm2 4145; SSE-NEXT: por %xmm7, %xmm3 4146; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4147; SSE-NEXT: pand %xmm4, %xmm3 4148; SSE-NEXT: pand %xmm4, %xmm2 4149; SSE-NEXT: packuswb %xmm3, %xmm2 4150; SSE-NEXT: pand %xmm4, %xmm1 4151; SSE-NEXT: pand %xmm4, %xmm0 4152; SSE-NEXT: packuswb %xmm1, %xmm0 4153; SSE-NEXT: packuswb %xmm2, %xmm0 4154; SSE-NEXT: retq 4155; 4156; AVX1-LABEL: trunc_or_v16i32_v16i8: 4157; AVX1: # %bb.0: 4158; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4159; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4160; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 4161; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 4162; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4163; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4164; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 4165; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4166; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4167; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4168; AVX1-NEXT: vzeroupper 4169; AVX1-NEXT: retq 4170; 4171; AVX2-LABEL: trunc_or_v16i32_v16i8: 4172; AVX2: # %bb.0: 4173; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 4174; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 4175; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4176; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 4177; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 4178; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4179; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4180; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4181; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4182; AVX2-NEXT: vzeroupper 4183; AVX2-NEXT: retq 4184; 4185; AVX512-LABEL: trunc_or_v16i32_v16i8: 4186; AVX512: # %bb.0: 4187; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 4188; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4189; AVX512-NEXT: vzeroupper 4190; 
AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or to constant
;

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %1, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}
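
;
; Note on the mul_add_* tests above: they model the sext/mul/add/trunc chains a
; loop vectorizer can emit when source arithmetic is done in a 64-bit
; intermediate type and the result is stored back as i32. A hypothetical scalar
; origin is sketched below (an illustrative assumption only; the function name,
; parameters, and the exact addend are not taken from this test):
;
;   void mul_add(int *dst, const int *a, const int *b, int n) {
;     for (int i = 0; i < n; ++i)
;       dst[i] = (int)((long long)a[i] * b[i] + i); /* widened to 64 bits, truncated on store */
;   }
;
; Because only the low 32 bits of the i64 intermediate are demanded, the backend
; is expected to narrow the whole chain to 32-bit vector ops, as the AVX checks
; above show (vpmulld/vpaddd instead of 64-bit multiply expansion).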