1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW 14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL 15; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL 16 17define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { 18; SSE-LABEL: trunc8i64_8i32: 19; SSE: # %bb.0: # %entry 20; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 21; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 22; SSE-NEXT: movaps %xmm2, %xmm1 23; SSE-NEXT: retq 24; 25; AVX1-LABEL: trunc8i64_8i32: 26; AVX1: # %bb.0: # %entry 27; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 28; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 29; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 30; AVX1-NEXT: retq 31; 32; AVX2-SLOW-LABEL: trunc8i64_8i32: 33; AVX2-SLOW: # %bb.0: # %entry 34; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 35; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 36; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 37; AVX2-SLOW-NEXT: retq 38; 39; AVX2-FAST-ALL-LABEL: trunc8i64_8i32: 40; AVX2-FAST-ALL: # %bb.0: # %entry 41; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 42; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 43; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 44; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 45; AVX2-FAST-ALL-NEXT: retq 46; 47; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32: 48; AVX2-FAST-PERLANE: # %bb.0: # %entry 49; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 50; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 51; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 52; AVX2-FAST-PERLANE-NEXT: retq 53; 54; AVX512-LABEL: trunc8i64_8i32: 55; AVX512: # %bb.0: # %entry 56; AVX512-NEXT: vpmovqd %zmm0, %ymm0 57; AVX512-NEXT: retq 58entry: 59 %0 = trunc <8 x i64> %a to <8 x i32> 60 ret <8 x i32> %0 61} 62 63define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { 64; SSE-LABEL: trunc8i64_8i32_ashr: 65; SSE: # %bb.0: # %entry 66; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 67; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] 68; SSE-NEXT: movaps %xmm2, %xmm1 69; SSE-NEXT: retq 70; 71; AVX1-LABEL: trunc8i64_8i32_ashr: 72; AVX1: # %bb.0: # %entry 73; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 74; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 75; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 76; AVX1-NEXT: retq 77; 78; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: 79; AVX2-SLOW: # %bb.0: # %entry 80; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 81; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 82; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 83; AVX2-SLOW-NEXT: retq 84; 85; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr: 86; AVX2-FAST-ALL: # %bb.0: # %entry 87; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] 88; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 89; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 90; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 91; AVX2-FAST-ALL-NEXT: retq 92; 93; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr: 94; AVX2-FAST-PERLANE: # %bb.0: # %entry 95; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 96; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 97; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 98; AVX2-FAST-PERLANE-NEXT: retq 99; 100; AVX512-LABEL: trunc8i64_8i32_ashr: 101; AVX512: # %bb.0: # %entry 102; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 103; AVX512-NEXT: vpmovqd %zmm0, %ymm0 104; AVX512-NEXT: retq 105entry: 106 %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 107 %1 = trunc <8 x i64> %0 to <8 x i32> 108 ret <8 x i32> %1 109} 110 111define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) { 112; SSE-LABEL: trunc8i64_8i32_lshr: 113; SSE: # %bb.0: # %entry 114; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 115; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] 116; SSE-NEXT: movaps %xmm2, %xmm1 117; SSE-NEXT: retq 118; 119; AVX1-LABEL: trunc8i64_8i32_lshr: 120; AVX1: # %bb.0: # %entry 121; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 122; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 123; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 124; AVX1-NEXT: retq 125; 126; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: 127; AVX2-SLOW: # %bb.0: # %entry 128; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 129; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 130; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 131; AVX2-SLOW-NEXT: retq 132; 133; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr: 134; AVX2-FAST-ALL: # %bb.0: # %entry 135; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] 136; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 137; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 138; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 139; AVX2-FAST-ALL-NEXT: retq 140; 141; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr: 142; AVX2-FAST-PERLANE: # %bb.0: # %entry 143; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 144; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 145; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 146; AVX2-FAST-PERLANE-NEXT: retq 147; 148; AVX512-LABEL: trunc8i64_8i32_lshr: 149; AVX512: # %bb.0: # %entry 150; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 151; AVX512-NEXT: vpmovqd %zmm0, %ymm0 152; AVX512-NEXT: retq 153entry: 154 %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 155 %1 = trunc <8 x i64> %0 to <8 x i32> 156 ret <8 x i32> %1 157} 158 159define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { 160; SSE2-SSSE3-LABEL: trunc8i64_8i16: 161; SSE2-SSSE3: # %bb.0: # %entry 162; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 163; SSE2-SSSE3-NEXT: pslld $16, %xmm2 164; SSE2-SSSE3-NEXT: psrad $16, %xmm2 165; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 166; SSE2-SSSE3-NEXT: pslld $16, %xmm0 167; SSE2-SSSE3-NEXT: psrad $16, %xmm0 168; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 169; SSE2-SSSE3-NEXT: retq 170; 171; SSE41-LABEL: trunc8i64_8i16: 172; SSE41: # %bb.0: # %entry 173; SSE41-NEXT: pxor %xmm4, %xmm4 174; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] 175; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] 176; SSE41-NEXT: packusdw %xmm3, %xmm2 177; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] 178; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] 179; SSE41-NEXT: packusdw %xmm1, %xmm0 180; SSE41-NEXT: packusdw %xmm2, %xmm0 181; SSE41-NEXT: retq 182; 183; AVX1-LABEL: trunc8i64_8i16: 184; AVX1: # %bb.0: # %entry 185; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] 186; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 187; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 188; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 189; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 190; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 191; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 192; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 193; AVX1-NEXT: vzeroupper 194; AVX1-NEXT: retq 195; 196; AVX2-LABEL: trunc8i64_8i16: 197; AVX2: # %bb.0: # %entry 198; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 199; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 200; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 201; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 202; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 203; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 204; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 205; AVX2-NEXT: vzeroupper 206; AVX2-NEXT: retq 207; 208; AVX512-LABEL: trunc8i64_8i16: 209; AVX512: # %bb.0: # %entry 210; AVX512-NEXT: vpmovqw %zmm0, %xmm0 211; AVX512-NEXT: vzeroupper 212; AVX512-NEXT: retq 213entry: 214 %0 = trunc <8 x i64> %a to <8 x i16> 215 ret <8 x i16> %0 216} 217 218define void @trunc8i64_8i8(<8 x i64> %a) { 219; SSE2-SSSE3-LABEL: trunc8i64_8i8: 220; SSE2-SSSE3: # %bb.0: # %entry 221; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 222; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 223; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 224; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 225; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 226; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 227; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 228; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 229; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 230; SSE2-SSSE3-NEXT: movq %xmm0, (%rax) 231; SSE2-SSSE3-NEXT: retq 232; 233; SSE41-LABEL: trunc8i64_8i8: 234; SSE41: # %bb.0: # %entry 235; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255] 236; SSE41-NEXT: pand %xmm4, %xmm3 237; SSE41-NEXT: pand %xmm4, %xmm2 238; SSE41-NEXT: packusdw %xmm3, %xmm2 239; SSE41-NEXT: pand %xmm4, %xmm1 240; SSE41-NEXT: pand %xmm4, %xmm0 241; SSE41-NEXT: packusdw %xmm1, %xmm0 242; SSE41-NEXT: packusdw %xmm2, %xmm0 243; SSE41-NEXT: packuswb %xmm0, %xmm0 244; SSE41-NEXT: movq %xmm0, (%rax) 245; SSE41-NEXT: retq 246; 247; AVX1-LABEL: trunc8i64_8i8: 248; AVX1: # %bb.0: # %entry 249; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] 250; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 251; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 252; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 253; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 254; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 255; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 256; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 257; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 258; AVX1-NEXT: vmovq %xmm0, (%rax) 259; AVX1-NEXT: vzeroupper 260; AVX1-NEXT: retq 261; 262; AVX2-LABEL: trunc8i64_8i8: 263; AVX2: # %bb.0: # %entry 264; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 265; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 266; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 267; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 268; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 269; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 270; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 271; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 272; AVX2-NEXT: vmovq %xmm0, (%rax) 273; AVX2-NEXT: vzeroupper 274; AVX2-NEXT: retq 275; 276; AVX512-LABEL: trunc8i64_8i8: 277; AVX512: # %bb.0: # %entry 278; AVX512-NEXT: vpmovqb %zmm0, (%rax) 279; AVX512-NEXT: vzeroupper 280; AVX512-NEXT: retq 281entry: 282 %0 = trunc <8 x i64> %a to <8 x i8> 283 store <8 x i8> %0, ptr undef, align 4 284 ret void 285} 286 287define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) { 288; SSE2-LABEL: trunc8i32_8i16: 289; SSE2: # %bb.0: # %entry 290; SSE2-NEXT: pslld $16, %xmm1 291; SSE2-NEXT: psrad $16, %xmm1 292; SSE2-NEXT: pslld $16, %xmm0 293; SSE2-NEXT: psrad $16, %xmm0 294; SSE2-NEXT: packssdw %xmm1, %xmm0 295; SSE2-NEXT: retq 296; 297; SSSE3-LABEL: trunc8i32_8i16: 298; SSSE3: # %bb.0: # %entry 299; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 300; SSSE3-NEXT: pshufb %xmm2, %xmm1 301; SSSE3-NEXT: pshufb %xmm2, %xmm0 302; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 303; SSSE3-NEXT: retq 304; 305; SSE41-LABEL: trunc8i32_8i16: 306; SSE41: # %bb.0: # %entry 307; SSE41-NEXT: pxor %xmm2, %xmm2 308; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 309; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 310; SSE41-NEXT: packusdw %xmm1, %xmm0 311; SSE41-NEXT: retq 312; 313; AVX1-LABEL: trunc8i32_8i16: 314; AVX1: # %bb.0: # %entry 315; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 316; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 317; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 318; AVX1-NEXT: vzeroupper 319; AVX1-NEXT: retq 320; 321; AVX2-LABEL: trunc8i32_8i16: 322; AVX2: # %bb.0: # %entry 323; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 324; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 325; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 326; AVX2-NEXT: vzeroupper 327; AVX2-NEXT: retq 328; 329; AVX512F-LABEL: trunc8i32_8i16: 330; AVX512F: # %bb.0: # %entry 331; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 332; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 333; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 334; AVX512F-NEXT: vzeroupper 335; AVX512F-NEXT: retq 336; 337; AVX512VL-LABEL: trunc8i32_8i16: 338; AVX512VL: # %bb.0: # %entry 339; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 340; AVX512VL-NEXT: vzeroupper 341; AVX512VL-NEXT: retq 342; 343; AVX512BW-LABEL: trunc8i32_8i16: 344; AVX512BW: # %bb.0: # %entry 345; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 346; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 347; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 348; AVX512BW-NEXT: vzeroupper 349; AVX512BW-NEXT: retq 350; 351; AVX512BWVL-LABEL: trunc8i32_8i16: 352; AVX512BWVL: # %bb.0: # %entry 353; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 354; AVX512BWVL-NEXT: vzeroupper 355; AVX512BWVL-NEXT: retq 356entry: 357 %0 = trunc <8 x i32> %a to <8 x i16> 358 ret <8 x i16> %0 359} 360 361define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { 362; SSE2-SSSE3-LABEL: trunc8i32_8i16_ashr: 363; SSE2-SSSE3: # %bb.0: # %entry 364; SSE2-SSSE3-NEXT: psrad $16, %xmm1 365; SSE2-SSSE3-NEXT: psrad $16, %xmm0 366; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 367; SSE2-SSSE3-NEXT: retq 368; 369; SSE41-LABEL: trunc8i32_8i16_ashr: 370; SSE41: # %bb.0: # %entry 371; SSE41-NEXT: psrld $16, %xmm1 372; SSE41-NEXT: psrld $16, %xmm0 373; SSE41-NEXT: packusdw %xmm1, %xmm0 374; SSE41-NEXT: retq 375; 376; AVX1-LABEL: trunc8i32_8i16_ashr: 377; AVX1: # %bb.0: # %entry 378; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 379; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 380; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 381; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 382; AVX1-NEXT: vzeroupper 383; AVX1-NEXT: retq 384; 385; AVX2-LABEL: trunc8i32_8i16_ashr: 386; AVX2: # %bb.0: # %entry 387; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 388; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 389; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 390; AVX2-NEXT: vzeroupper 391; AVX2-NEXT: retq 392; 393; AVX512F-LABEL: trunc8i32_8i16_ashr: 394; AVX512F: # %bb.0: # %entry 395; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 396; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 397; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 398; AVX512F-NEXT: vzeroupper 399; AVX512F-NEXT: retq 400; 401; AVX512VL-LABEL: trunc8i32_8i16_ashr: 402; AVX512VL: # %bb.0: # %entry 403; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 404; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 405; AVX512VL-NEXT: vzeroupper 406; AVX512VL-NEXT: retq 407; 408; AVX512BW-LABEL: trunc8i32_8i16_ashr: 409; AVX512BW: # %bb.0: # %entry 410; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 411; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 412; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 413; AVX512BW-NEXT: vzeroupper 414; AVX512BW-NEXT: retq 415; 416; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: 417; AVX512BWVL: # %bb.0: # %entry 418; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 419; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 420; AVX512BWVL-NEXT: vzeroupper 421; AVX512BWVL-NEXT: retq 422entry: 423 %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 424 %1 = trunc <8 x i32> %0 to <8 x i16> 425 ret <8 x i16> %1 426} 427 428define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) { 429; SSE2-SSSE3-LABEL: trunc8i32_8i16_lshr: 430; SSE2-SSSE3: # %bb.0: # %entry 431; SSE2-SSSE3-NEXT: psrad $16, %xmm1 432; SSE2-SSSE3-NEXT: psrad $16, %xmm0 433; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 434; SSE2-SSSE3-NEXT: retq 435; 436; SSE41-LABEL: trunc8i32_8i16_lshr: 437; SSE41: # %bb.0: # %entry 438; SSE41-NEXT: psrld $16, %xmm1 439; SSE41-NEXT: psrld $16, %xmm0 440; SSE41-NEXT: packusdw %xmm1, %xmm0 441; SSE41-NEXT: retq 442; 443; AVX1-LABEL: trunc8i32_8i16_lshr: 444; AVX1: # %bb.0: # %entry 445; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 446; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 447; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 448; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 449; AVX1-NEXT: vzeroupper 450; AVX1-NEXT: retq 451; 452; AVX2-LABEL: trunc8i32_8i16_lshr: 453; AVX2: # %bb.0: # %entry 454; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 455; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 456; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 457; AVX2-NEXT: vzeroupper 458; AVX2-NEXT: retq 459; 460; AVX512F-LABEL: trunc8i32_8i16_lshr: 461; AVX512F: # %bb.0: # %entry 462; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 463; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 464; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 465; AVX512F-NEXT: vzeroupper 466; AVX512F-NEXT: retq 467; 468; AVX512VL-LABEL: trunc8i32_8i16_lshr: 469; AVX512VL: # %bb.0: # %entry 470; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 471; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 472; AVX512VL-NEXT: vzeroupper 473; AVX512VL-NEXT: retq 474; 475; AVX512BW-LABEL: trunc8i32_8i16_lshr: 476; AVX512BW: # %bb.0: # %entry 477; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 478; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 479; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 480; AVX512BW-NEXT: vzeroupper 481; AVX512BW-NEXT: retq 482; 483; AVX512BWVL-LABEL: trunc8i32_8i16_lshr: 484; AVX512BWVL: # %bb.0: # %entry 485; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 486; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 487; AVX512BWVL-NEXT: vzeroupper 488; AVX512BWVL-NEXT: retq 489entry: 490 %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 491 %1 = trunc <8 x i32> %0 to <8 x i16> 492 ret <8 x i16> %1 493} 494 495define void @trunc8i32_8i8(<8 x i32> %a) { 496; SSE2-SSSE3-LABEL: trunc8i32_8i8: 497; SSE2-SSSE3: # %bb.0: # %entry 498; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 499; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 500; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 501; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 502; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 503; SSE2-SSSE3-NEXT: movq %xmm0, (%rax) 504; SSE2-SSSE3-NEXT: retq 505; 506; SSE41-LABEL: trunc8i32_8i8: 507; SSE41: # %bb.0: # %entry 508; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] 509; SSE41-NEXT: pand %xmm2, %xmm1 510; SSE41-NEXT: pand %xmm2, %xmm0 511; SSE41-NEXT: packusdw %xmm1, %xmm0 512; SSE41-NEXT: packuswb %xmm0, %xmm0 513; SSE41-NEXT: movq %xmm0, (%rax) 514; SSE41-NEXT: retq 515; 516; AVX1-LABEL: trunc8i32_8i8: 517; AVX1: # %bb.0: # %entry 518; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 519; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] 520; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 521; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 522; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 523; AVX1-NEXT: vmovq %xmm0, (%rax) 524; AVX1-NEXT: vzeroupper 525; AVX1-NEXT: retq 526; 527; AVX2-LABEL: trunc8i32_8i8: 528; AVX2: # %bb.0: # %entry 529; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 530; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] 531; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 532; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 533; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 534; AVX2-NEXT: vmovq %xmm0, (%rax) 535; AVX2-NEXT: vzeroupper 536; AVX2-NEXT: retq 537; 538; AVX512F-LABEL: trunc8i32_8i8: 539; AVX512F: # %bb.0: # %entry 540; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 541; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 542; AVX512F-NEXT: vmovq %xmm0, (%rax) 543; AVX512F-NEXT: vzeroupper 544; AVX512F-NEXT: retq 545; 546; AVX512VL-LABEL: trunc8i32_8i8: 547; AVX512VL: # %bb.0: # %entry 548; AVX512VL-NEXT: vpmovdb %ymm0, (%rax) 549; AVX512VL-NEXT: vzeroupper 550; AVX512VL-NEXT: retq 551; 552; AVX512BW-LABEL: trunc8i32_8i8: 553; AVX512BW: # %bb.0: # %entry 554; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 555; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 556; AVX512BW-NEXT: vmovq %xmm0, (%rax) 557; AVX512BW-NEXT: vzeroupper 558; AVX512BW-NEXT: retq 559; 560; AVX512BWVL-LABEL: trunc8i32_8i8: 561; AVX512BWVL: # %bb.0: # %entry 562; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax) 563; AVX512BWVL-NEXT: vzeroupper 564; AVX512BWVL-NEXT: retq 565entry: 566 %0 = trunc <8 x i32> %a to <8 x i8> 567 store <8 x i8> %0, ptr undef, align 4 568 ret void 569} 570 571define void @trunc16i32_16i16(<16 x i32> %a) { 572; SSE2-LABEL: trunc16i32_16i16: 573; SSE2: # %bb.0: # %entry 574; SSE2-NEXT: pslld $16, %xmm1 575; SSE2-NEXT: psrad $16, %xmm1 576; SSE2-NEXT: pslld $16, %xmm0 577; SSE2-NEXT: psrad $16, %xmm0 578; SSE2-NEXT: packssdw %xmm1, %xmm0 579; SSE2-NEXT: pslld $16, %xmm3 580; SSE2-NEXT: psrad $16, %xmm3 581; SSE2-NEXT: pslld $16, %xmm2 582; SSE2-NEXT: psrad $16, %xmm2 583; SSE2-NEXT: packssdw %xmm3, %xmm2 584; SSE2-NEXT: movdqu %xmm2, (%rax) 585; SSE2-NEXT: movdqu %xmm0, (%rax) 586; SSE2-NEXT: retq 587; 588; SSSE3-LABEL: trunc16i32_16i16: 589; SSSE3: # %bb.0: # %entry 590; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 591; SSSE3-NEXT: pshufb %xmm4, %xmm1 592; SSSE3-NEXT: pshufb %xmm4, %xmm0 593; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 594; SSSE3-NEXT: pshufb %xmm4, %xmm3 595; SSSE3-NEXT: pshufb %xmm4, %xmm2 596; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 597; SSSE3-NEXT: movdqu %xmm2, (%rax) 598; SSSE3-NEXT: movdqu %xmm0, (%rax) 599; SSSE3-NEXT: retq 600; 601; SSE41-LABEL: trunc16i32_16i16: 602; SSE41: # %bb.0: # %entry 603; SSE41-NEXT: pxor %xmm4, %xmm4 604; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] 605; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] 606; SSE41-NEXT: packusdw %xmm1, %xmm0 607; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] 608; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] 609; SSE41-NEXT: packusdw %xmm3, %xmm2 610; SSE41-NEXT: movdqu %xmm2, (%rax) 611; SSE41-NEXT: movdqu %xmm0, (%rax) 612; SSE41-NEXT: retq 613; 614; AVX1-LABEL: trunc16i32_16i16: 615; AVX1: # %bb.0: # %entry 616; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 617; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 618; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 619; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 620; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 621; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 622; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 623; AVX1-NEXT: vmovdqu %xmm1, (%rax) 624; AVX1-NEXT: vmovdqu %xmm0, (%rax) 625; AVX1-NEXT: vzeroupper 626; AVX1-NEXT: retq 627; 628; AVX2-LABEL: trunc16i32_16i16: 629; AVX2: # %bb.0: # %entry 630; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 631; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] 632; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 633; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 634; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 635; AVX2-NEXT: vmovdqu %ymm0, (%rax) 636; AVX2-NEXT: vzeroupper 637; AVX2-NEXT: retq 638; 639; AVX512-LABEL: trunc16i32_16i16: 640; AVX512: # %bb.0: # %entry 641; AVX512-NEXT: vpmovdw %zmm0, (%rax) 642; AVX512-NEXT: vzeroupper 643; AVX512-NEXT: retq 644entry: 645 %0 = trunc <16 x i32> %a to <16 x i16> 646 store <16 x i16> %0, ptr undef, align 4 647 ret void 648} 649 650define void @trunc16i32_16i16_ashr(<16 x i32> %a) { 651; SSE2-SSSE3-LABEL: trunc16i32_16i16_ashr: 652; SSE2-SSSE3: # %bb.0: # %entry 653; SSE2-SSSE3-NEXT: psrad $16, %xmm1 654; SSE2-SSSE3-NEXT: psrad $16, %xmm0 655; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 656; SSE2-SSSE3-NEXT: psrad $16, %xmm3 657; SSE2-SSSE3-NEXT: psrad $16, %xmm2 658; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 659; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax) 660; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 661; SSE2-SSSE3-NEXT: retq 662; 663; SSE41-LABEL: trunc16i32_16i16_ashr: 664; SSE41: # %bb.0: # %entry 665; SSE41-NEXT: psrld $16, %xmm3 666; SSE41-NEXT: psrld $16, %xmm2 667; SSE41-NEXT: packusdw %xmm3, %xmm2 668; SSE41-NEXT: psrld $16, %xmm1 669; SSE41-NEXT: psrld $16, %xmm0 670; SSE41-NEXT: packusdw %xmm1, %xmm0 671; SSE41-NEXT: movdqu %xmm2, (%rax) 672; SSE41-NEXT: movdqu %xmm0, (%rax) 673; SSE41-NEXT: retq 674; 675; AVX1-LABEL: trunc16i32_16i16_ashr: 676; AVX1: # %bb.0: # %entry 677; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 678; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 679; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 680; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 681; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 682; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 683; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 684; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 685; AVX1-NEXT: vmovdqu %xmm1, (%rax) 686; AVX1-NEXT: vmovdqu %xmm0, (%rax) 687; AVX1-NEXT: vzeroupper 688; AVX1-NEXT: retq 689; 690; AVX2-LABEL: trunc16i32_16i16_ashr: 691; AVX2: # %bb.0: # %entry 692; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 693; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 694; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 695; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 696; AVX2-NEXT: vmovdqu %ymm0, (%rax) 697; AVX2-NEXT: vzeroupper 698; AVX2-NEXT: retq 699; 700; AVX512-LABEL: trunc16i32_16i16_ashr: 701; AVX512: # %bb.0: # %entry 702; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 703; AVX512-NEXT: vpmovdw %zmm0, (%rax) 704; AVX512-NEXT: vzeroupper 705; AVX512-NEXT: retq 706entry: 707 %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 708 %1 = trunc <16 x i32> %0 to <16 x i16> 709 store <16 x i16> %1, ptr undef, align 4 710 ret void 711} 712 713define void @trunc16i32_16i16_lshr(<16 x i32> %a) { 714; SSE2-SSSE3-LABEL: trunc16i32_16i16_lshr: 715; SSE2-SSSE3: # %bb.0: # %entry 716; SSE2-SSSE3-NEXT: psrad $16, %xmm1 717; SSE2-SSSE3-NEXT: psrad $16, %xmm0 718; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 719; SSE2-SSSE3-NEXT: psrad $16, %xmm3 720; SSE2-SSSE3-NEXT: psrad $16, %xmm2 721; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 722; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax) 723; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 724; SSE2-SSSE3-NEXT: retq 725; 726; SSE41-LABEL: trunc16i32_16i16_lshr: 727; SSE41: # %bb.0: # %entry 728; SSE41-NEXT: psrld $16, %xmm3 729; SSE41-NEXT: psrld $16, %xmm2 730; SSE41-NEXT: packusdw %xmm3, %xmm2 731; SSE41-NEXT: psrld $16, %xmm1 732; SSE41-NEXT: psrld $16, %xmm0 733; SSE41-NEXT: packusdw %xmm1, %xmm0 734; SSE41-NEXT: movdqu %xmm2, (%rax) 735; SSE41-NEXT: movdqu %xmm0, (%rax) 736; SSE41-NEXT: retq 737; 738; AVX1-LABEL: trunc16i32_16i16_lshr: 739; AVX1: # %bb.0: # %entry 740; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 741; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 742; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 743; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 744; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 745; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 746; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 747; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 748; AVX1-NEXT: vmovdqu %xmm1, (%rax) 749; AVX1-NEXT: vmovdqu %xmm0, (%rax) 750; AVX1-NEXT: vzeroupper 751; AVX1-NEXT: retq 752; 753; AVX2-LABEL: trunc16i32_16i16_lshr: 754; AVX2: # %bb.0: # %entry 755; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 756; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 757; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 758; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 759; AVX2-NEXT: vmovdqu %ymm0, (%rax) 760; AVX2-NEXT: vzeroupper 761; AVX2-NEXT: retq 762; 763; AVX512-LABEL: trunc16i32_16i16_lshr: 764; AVX512: # %bb.0: # %entry 765; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 766; AVX512-NEXT: vpmovdw %zmm0, (%rax) 767; AVX512-NEXT: vzeroupper 768; AVX512-NEXT: retq 769entry: 770 %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 771 %1 = trunc <16 x i32> %0 to <16 x i16> 772 store <16 x i16> %1, ptr undef, align 4 773 ret void 774} 775 776define void @trunc16i32_16i8(<16 x i32> %a) { 777; SSE2-SSSE3-LABEL: trunc16i32_16i8: 778; SSE2-SSSE3: # %bb.0: # %entry 779; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 780; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 781; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 782; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 783; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 784; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 785; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 786; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 787; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 788; SSE2-SSSE3-NEXT: retq 789; 790; SSE41-LABEL: trunc16i32_16i8: 791; SSE41: # %bb.0: # %entry 792; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] 793; SSE41-NEXT: pand %xmm4, %xmm3 794; SSE41-NEXT: pand %xmm4, %xmm2 795; SSE41-NEXT: packusdw %xmm3, %xmm2 796; SSE41-NEXT: pand %xmm4, %xmm1 797; SSE41-NEXT: pand %xmm4, %xmm0 798; SSE41-NEXT: packusdw %xmm1, %xmm0 799; SSE41-NEXT: packuswb %xmm2, %xmm0 800; SSE41-NEXT: movdqu %xmm0, (%rax) 801; SSE41-NEXT: retq 802; 803; AVX1-LABEL: trunc16i32_16i8: 804; AVX1: # %bb.0: # %entry 805; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 806; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 807; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 808; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 809; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 810; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 811; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 812; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 813; AVX1-NEXT: vmovdqu %xmm0, (%rax) 814; AVX1-NEXT: vzeroupper 815; AVX1-NEXT: retq 816; 817; AVX2-LABEL: trunc16i32_16i8: 818; AVX2: # %bb.0: # %entry 819; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 820; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 821; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 822; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 823; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 824; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 825; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 826; AVX2-NEXT: vmovdqu %xmm0, (%rax) 827; AVX2-NEXT: vzeroupper 828; AVX2-NEXT: retq 829; 830; AVX512-LABEL: trunc16i32_16i8: 831; AVX512: # %bb.0: # %entry 832; AVX512-NEXT: vpmovdb %zmm0, (%rax) 833; AVX512-NEXT: vzeroupper 834; AVX512-NEXT: retq 835entry: 836 %0 = trunc <16 x i32> %a to <16 x i8> 837 store <16 x i8> %0, ptr undef, align 4 838 ret void 839} 840 841define void @trunc16i32_16i8_ashr(<16 x i32> %a) { 842; SSE2-SSSE3-LABEL: trunc16i32_16i8_ashr: 843; SSE2-SSSE3: # %bb.0: # %entry 844; SSE2-SSSE3-NEXT: psrld $24, %xmm1 845; SSE2-SSSE3-NEXT: psrld $24, %xmm0 846; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 847; SSE2-SSSE3-NEXT: psrld $24, %xmm3 848; SSE2-SSSE3-NEXT: psrld $24, %xmm2 849; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 850; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 851; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 852; SSE2-SSSE3-NEXT: retq 853; 854; SSE41-LABEL: trunc16i32_16i8_ashr: 855; SSE41: # %bb.0: # %entry 856; SSE41-NEXT: psrld $24, %xmm1 857; SSE41-NEXT: psrld $24, %xmm0 858; SSE41-NEXT: packusdw %xmm1, %xmm0 859; SSE41-NEXT: psrld $24, %xmm3 860; SSE41-NEXT: psrld $24, %xmm2 861; SSE41-NEXT: packusdw %xmm3, %xmm2 862; SSE41-NEXT: packuswb %xmm2, %xmm0 863; SSE41-NEXT: movdqu %xmm0, (%rax) 864; SSE41-NEXT: retq 865; 866; AVX1-LABEL: trunc16i32_16i8_ashr: 867; AVX1: # %bb.0: # %entry 868; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 869; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 870; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 871; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 872; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 873; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 874; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 875; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 876; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 877; AVX1-NEXT: vmovdqu %xmm0, (%rax) 878; AVX1-NEXT: vzeroupper 879; AVX1-NEXT: retq 880; 881; AVX2-LABEL: trunc16i32_16i8_ashr: 882; AVX2: # %bb.0: # %entry 883; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 884; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 885; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 886; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 887; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 888; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 889; AVX2-NEXT: vmovdqu %xmm0, (%rax) 890; AVX2-NEXT: vzeroupper 891; AVX2-NEXT: retq 892; 893; AVX512-LABEL: trunc16i32_16i8_ashr: 894; AVX512: # %bb.0: # %entry 895; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 896; AVX512-NEXT: vpmovdb %zmm0, (%rax) 897; AVX512-NEXT: vzeroupper 898; AVX512-NEXT: retq 899entry: 900 %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> 901 %1 = trunc <16 x i32> %0 to <16 x i8> 902 store <16 x i8> %1, ptr undef, align 4 903 ret void 904} 905 906define void @trunc16i32_16i8_lshr(<16 x i32> %a) { 907; SSE2-SSSE3-LABEL: trunc16i32_16i8_lshr: 908; SSE2-SSSE3: # %bb.0: # %entry 909; SSE2-SSSE3-NEXT: psrld $24, %xmm1 910; SSE2-SSSE3-NEXT: psrld $24, %xmm0 911; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 912; SSE2-SSSE3-NEXT: psrld $24, %xmm3 913; SSE2-SSSE3-NEXT: psrld $24, %xmm2 914; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 915; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 916; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 917; SSE2-SSSE3-NEXT: retq 918; 919; SSE41-LABEL: trunc16i32_16i8_lshr: 920; SSE41: # %bb.0: # %entry 921; SSE41-NEXT: psrld $24, %xmm1 922; SSE41-NEXT: psrld $24, %xmm0 923; SSE41-NEXT: packusdw %xmm1, %xmm0 924; SSE41-NEXT: psrld $24, %xmm3 925; SSE41-NEXT: psrld $24, %xmm2 926; SSE41-NEXT: packusdw %xmm3, %xmm2 927; SSE41-NEXT: packuswb %xmm2, %xmm0 928; SSE41-NEXT: movdqu %xmm0, (%rax) 929; SSE41-NEXT: retq 930; 931; AVX1-LABEL: trunc16i32_16i8_lshr: 932; AVX1: # %bb.0: # %entry 933; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 934; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 935; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 936; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 937; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 938; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 939; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 940; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 941; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 942; AVX1-NEXT: vmovdqu %xmm0, (%rax) 943; AVX1-NEXT: vzeroupper 944; AVX1-NEXT: retq 945; 946; AVX2-LABEL: trunc16i32_16i8_lshr: 947; AVX2: # %bb.0: # %entry 948; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 949; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 950; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 951; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 952; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 953; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 954; AVX2-NEXT: vmovdqu %xmm0, (%rax) 955; AVX2-NEXT: vzeroupper 956; AVX2-NEXT: retq 957; 958; AVX512-LABEL: trunc16i32_16i8_lshr: 959; AVX512: # %bb.0: # %entry 960; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 961; AVX512-NEXT: vpmovdb %zmm0, (%rax) 962; AVX512-NEXT: vzeroupper 963; AVX512-NEXT: retq 964entry: 965 %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> 966 %1 = trunc <16 x i32> %0 to <16 x i8> 967 store <16 x i8> %1, ptr undef, align 4 968 ret void 969} 970 971;PR25684 972define void @trunc16i16_16i8(<16 x i16> %a) { 973; SSE2-SSSE3-LABEL: trunc16i16_16i8: 974; SSE2-SSSE3: # %bb.0: # %entry 975; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 976; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 977; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 978; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 979; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 980; SSE2-SSSE3-NEXT: retq 981; 982; SSE41-LABEL: trunc16i16_16i8: 983; SSE41: # %bb.0: # %entry 984; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 985; SSE41-NEXT: pand %xmm2, %xmm1 986; SSE41-NEXT: pand %xmm2, %xmm0 987; SSE41-NEXT: packuswb %xmm1, %xmm0 988; SSE41-NEXT: movdqu %xmm0, (%rax) 989; SSE41-NEXT: retq 990; 991; AVX1-LABEL: trunc16i16_16i8: 992; AVX1: # %bb.0: # %entry 993; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 994; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 995; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 996; AVX1-NEXT: vmovdqu %xmm0, (%rax) 997; AVX1-NEXT: vzeroupper 998; AVX1-NEXT: retq 999; 1000; AVX2-LABEL: trunc16i16_16i8: 1001; AVX2: # %bb.0: # %entry 1002; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1003; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1004; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1005; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1006; AVX2-NEXT: vzeroupper 1007; AVX2-NEXT: retq 1008; 1009; AVX512F-LABEL: trunc16i16_16i8: 1010; AVX512F: # %bb.0: # %entry 1011; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1012; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1013; AVX512F-NEXT: vzeroupper 1014; AVX512F-NEXT: retq 1015; 1016; AVX512VL-LABEL: trunc16i16_16i8: 1017; AVX512VL: # %bb.0: # %entry 1018; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1019; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1020; AVX512VL-NEXT: vzeroupper 1021; AVX512VL-NEXT: retq 1022; 1023; AVX512BW-LABEL: trunc16i16_16i8: 1024; AVX512BW: # %bb.0: # %entry 1025; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1026; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1027; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1028; AVX512BW-NEXT: vzeroupper 1029; AVX512BW-NEXT: retq 1030; 1031; AVX512BWVL-LABEL: trunc16i16_16i8: 1032; AVX512BWVL: # %bb.0: # %entry 1033; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) 1034; AVX512BWVL-NEXT: vzeroupper 1035; AVX512BWVL-NEXT: retq 1036entry: 1037 %0 = trunc <16 x i16> %a to <16 x i8> 1038 store <16 x i8> %0, ptr undef, align 4 1039 ret void 1040} 1041 1042define void @trunc16i16_16i8_ashr(<16 x i16> %a) { 1043; SSE-LABEL: trunc16i16_16i8_ashr: 1044; SSE: # %bb.0: # %entry 1045; SSE-NEXT: psrlw $8, %xmm1 1046; SSE-NEXT: psrlw $8, %xmm0 1047; SSE-NEXT: packuswb %xmm1, %xmm0 1048; SSE-NEXT: movdqu %xmm0, (%rax) 1049; SSE-NEXT: retq 1050; 1051; AVX1-LABEL: trunc16i16_16i8_ashr: 1052; AVX1: # %bb.0: # %entry 1053; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1054; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1055; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1056; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1057; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1058; AVX1-NEXT: vzeroupper 1059; AVX1-NEXT: retq 1060; 1061; AVX2-LABEL: trunc16i16_16i8_ashr: 1062; AVX2: # %bb.0: # %entry 1063; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1064; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1065; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1066; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1067; AVX2-NEXT: vzeroupper 1068; AVX2-NEXT: retq 1069; 1070; AVX512F-LABEL: trunc16i16_16i8_ashr: 1071; AVX512F: # %bb.0: # %entry 1072; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1073; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1074; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1075; AVX512F-NEXT: vzeroupper 1076; AVX512F-NEXT: retq 1077; 1078; AVX512VL-LABEL: trunc16i16_16i8_ashr: 1079; AVX512VL: # %bb.0: # %entry 1080; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1081; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1082; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1083; AVX512VL-NEXT: vzeroupper 1084; AVX512VL-NEXT: retq 1085; 1086; AVX512BW-LABEL: trunc16i16_16i8_ashr: 1087; AVX512BW: # %bb.0: # %entry 1088; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 1089; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1090; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1091; AVX512BW-NEXT: vzeroupper 1092; AVX512BW-NEXT: retq 1093; 1094; AVX512BWVL-LABEL: trunc16i16_16i8_ashr: 1095; AVX512BWVL: # %bb.0: # %entry 1096; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 1097; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) 1098; AVX512BWVL-NEXT: vzeroupper 1099; AVX512BWVL-NEXT: retq 1100entry: 1101 %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1102 %1 = trunc <16 x i16> %0 to <16 x i8> 1103 store <16 x i8> %1, ptr undef, align 4 1104 ret void 1105} 1106 1107define void @trunc16i16_16i8_lshr(<16 x i16> %a) { 1108; SSE-LABEL: trunc16i16_16i8_lshr: 1109; SSE: # %bb.0: # %entry 1110; SSE-NEXT: psrlw $8, %xmm1 1111; SSE-NEXT: psrlw $8, %xmm0 1112; SSE-NEXT: packuswb %xmm1, %xmm0 1113; SSE-NEXT: movdqu %xmm0, (%rax) 1114; SSE-NEXT: retq 1115; 1116; AVX1-LABEL: trunc16i16_16i8_lshr: 1117; AVX1: # %bb.0: # %entry 1118; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1119; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1120; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1121; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1122; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1123; AVX1-NEXT: vzeroupper 1124; AVX1-NEXT: retq 1125; 1126; AVX2-LABEL: trunc16i16_16i8_lshr: 1127; AVX2: # %bb.0: # %entry 1128; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1129; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1130; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1131; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1132; AVX2-NEXT: vzeroupper 1133; AVX2-NEXT: retq 1134; 1135; AVX512F-LABEL: trunc16i16_16i8_lshr: 1136; AVX512F: # %bb.0: # %entry 1137; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1138; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1139; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1140; AVX512F-NEXT: vzeroupper 1141; AVX512F-NEXT: retq 1142; 1143; AVX512VL-LABEL: trunc16i16_16i8_lshr: 1144; AVX512VL: # %bb.0: # %entry 1145; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1146; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1147; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1148; AVX512VL-NEXT: vzeroupper 1149; AVX512VL-NEXT: retq 1150; 1151; AVX512BW-LABEL: trunc16i16_16i8_lshr: 1152; AVX512BW: # %bb.0: # %entry 1153; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 1154; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1155; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1156; AVX512BW-NEXT: vzeroupper 1157; AVX512BW-NEXT: retq 1158; 1159; AVX512BWVL-LABEL: trunc16i16_16i8_lshr: 1160; AVX512BWVL: # %bb.0: # %entry 1161; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 1162; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) 1163; AVX512BWVL-NEXT: vzeroupper 1164; AVX512BWVL-NEXT: retq 1165entry: 1166 %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1167 %1 = trunc <16 x i16> %0 to <16 x i8> 1168 store <16 x i8> %1, ptr undef, align 4 1169 ret void 1170} 1171 1172define void @trunc32i16_32i8(<32 x i16> %a) { 1173; SSE2-SSSE3-LABEL: trunc32i16_32i8: 1174; SSE2-SSSE3: # %bb.0: # %entry 1175; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1176; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 1177; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 1178; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 1179; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 1180; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 1181; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 1182; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax) 1183; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) 1184; SSE2-SSSE3-NEXT: retq 1185; 1186; SSE41-LABEL: trunc32i16_32i8: 1187; SSE41: # %bb.0: # %entry 1188; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1189; SSE41-NEXT: pand %xmm4, %xmm1 1190; SSE41-NEXT: pand %xmm4, %xmm0 1191; SSE41-NEXT: packuswb %xmm1, %xmm0 1192; SSE41-NEXT: pand %xmm4, %xmm3 1193; SSE41-NEXT: pand %xmm4, %xmm2 1194; SSE41-NEXT: packuswb %xmm3, %xmm2 1195; SSE41-NEXT: movdqu %xmm2, (%rax) 1196; SSE41-NEXT: movdqu %xmm0, (%rax) 1197; SSE41-NEXT: retq 1198; 1199; AVX1-LABEL: trunc32i16_32i8: 1200; AVX1: # %bb.0: # %entry 1201; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1202; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1203; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1204; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1205; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 1206; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1207; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1208; AVX1-NEXT: vmovdqu %xmm1, (%rax) 1209; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1210; AVX1-NEXT: vzeroupper 1211; AVX1-NEXT: retq 1212; 1213; AVX2-LABEL: trunc32i16_32i8: 1214; AVX2: # %bb.0: # %entry 1215; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1216; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1217; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1218; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1219; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1220; AVX2-NEXT: vmovdqu %ymm0, (%rax) 1221; AVX2-NEXT: vzeroupper 1222; AVX2-NEXT: retq 1223; 1224; AVX512F-LABEL: trunc32i16_32i8: 1225; AVX512F: # %bb.0: # %entry 1226; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1227; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1228; AVX512F-NEXT: vpmovdb %zmm1, (%rax) 1229; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1230; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1231; AVX512F-NEXT: vzeroupper 1232; AVX512F-NEXT: retq 1233; 1234; AVX512VL-LABEL: trunc32i16_32i8: 1235; AVX512VL: # %bb.0: # %entry 1236; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1237; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1238; AVX512VL-NEXT: vpmovdb %zmm1, (%rax) 1239; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1240; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1241; AVX512VL-NEXT: vzeroupper 1242; AVX512VL-NEXT: retq 1243; 1244; AVX512BW-LABEL: trunc32i16_32i8: 1245; AVX512BW: # %bb.0: # %entry 1246; AVX512BW-NEXT: vpmovwb %zmm0, (%rax) 1247; AVX512BW-NEXT: vzeroupper 1248; AVX512BW-NEXT: retq 1249; 1250; AVX512BWVL-LABEL: trunc32i16_32i8: 1251; AVX512BWVL: # %bb.0: # %entry 1252; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax) 1253; AVX512BWVL-NEXT: vzeroupper 1254; AVX512BWVL-NEXT: retq 1255entry: 1256 %0 = trunc <32 x i16> %a to <32 x i8> 1257 store <32 x i8> %0, ptr undef, align 4 1258 ret void 1259} 1260 1261define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { 1262; SSE-LABEL: trunc2x4i64_8i32: 1263; SSE: # %bb.0: # %entry 1264; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1265; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 1266; SSE-NEXT: movaps %xmm2, %xmm1 1267; SSE-NEXT: retq 1268; 1269; AVX1-LABEL: trunc2x4i64_8i32: 1270; AVX1: # %bb.0: # %entry 1271; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 1272; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1273; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 1274; AVX1-NEXT: retq 1275; 1276; AVX2-SLOW-LABEL: trunc2x4i64_8i32: 1277; AVX2-SLOW: # %bb.0: # %entry 1278; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 1279; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1280; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 1281; AVX2-SLOW-NEXT: retq 1282; 1283; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32: 1284; AVX2-FAST-ALL: # %bb.0: # %entry 1285; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1286; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 1287; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 1288; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1289; AVX2-FAST-ALL-NEXT: retq 1290; 1291; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32: 1292; AVX2-FAST-PERLANE: # %bb.0: # %entry 1293; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 1294; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1295; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 1296; AVX2-FAST-PERLANE-NEXT: retq 1297; 1298; AVX512-LABEL: trunc2x4i64_8i32: 1299; AVX512: # %bb.0: # %entry 1300; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1301; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1302; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1303; AVX512-NEXT: retq 1304entry: 1305 %0 = trunc <4 x i64> %a to <4 x i32> 1306 %1 = trunc <4 x i64> %b to <4 x i32> 1307 %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1308 ret <8 x i32> %2 1309} 1310 1311define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { 1312; SSE2-SSSE3-LABEL: trunc2x4i64_8i16: 1313; SSE2-SSSE3: # %bb.0: # %entry 1314; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1315; SSE2-SSSE3-NEXT: pslld $16, %xmm0 1316; SSE2-SSSE3-NEXT: psrad $16, %xmm0 1317; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 1318; SSE2-SSSE3-NEXT: pslld $16, %xmm2 1319; SSE2-SSSE3-NEXT: psrad $16, %xmm2 1320; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 1321; SSE2-SSSE3-NEXT: retq 1322; 1323; SSE41-LABEL: trunc2x4i64_8i16: 1324; SSE41: # %bb.0: # %entry 1325; SSE41-NEXT: pxor %xmm4, %xmm4 1326; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] 1327; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] 1328; SSE41-NEXT: packusdw %xmm1, %xmm0 1329; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] 1330; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] 1331; SSE41-NEXT: packusdw %xmm3, %xmm2 1332; SSE41-NEXT: packusdw %xmm2, %xmm0 1333; SSE41-NEXT: retq 1334; 1335; AVX1-LABEL: trunc2x4i64_8i16: 1336; AVX1: # %bb.0: # %entry 1337; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1338; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1339; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 1340; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 1341; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1342; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1343; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 1344; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 1345; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 1346; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1347; AVX1-NEXT: vzeroupper 1348; AVX1-NEXT: retq 1349; 1350; AVX2-LABEL: trunc2x4i64_8i16: 1351; AVX2: # %bb.0: # %entry 1352; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1353; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 1354; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1355; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1356; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 1357; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1358; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 1359; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1360; AVX2-NEXT: vzeroupper 1361; AVX2-NEXT: retq 1362; 1363; AVX512F-LABEL: trunc2x4i64_8i16: 1364; AVX512F: # %bb.0: # %entry 1365; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1366; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1367; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1368; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1369; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1370; AVX512F-NEXT: vzeroupper 1371; AVX512F-NEXT: retq 1372; 1373; AVX512VL-LABEL: trunc2x4i64_8i16: 1374; AVX512VL: # %bb.0: # %entry 1375; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 1376; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1 1377; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1378; AVX512VL-NEXT: vzeroupper 1379; AVX512VL-NEXT: retq 1380; 1381; AVX512BW-LABEL: trunc2x4i64_8i16: 1382; AVX512BW: # %bb.0: # %entry 1383; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1384; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1385; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1386; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1387; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1388; AVX512BW-NEXT: vzeroupper 1389; AVX512BW-NEXT: retq 1390; 1391; AVX512BWVL-LABEL: trunc2x4i64_8i16: 1392; AVX512BWVL: # %bb.0: # %entry 1393; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 1394; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1 1395; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1396; AVX512BWVL-NEXT: vzeroupper 1397; AVX512BWVL-NEXT: retq 1398entry: 1399 %0 = trunc <4 x i64> %a to <4 x i16> 1400 %1 = trunc <4 x i64> %b to <4 x i16> 1401 %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1402 ret <8 x i16> %2 1403} 1404 1405define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) { 1406; SSE-LABEL: trunc2x2i64_4i32: 1407; SSE: # %bb.0: # %entry 1408; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1409; SSE-NEXT: retq 1410; 1411; AVX-LABEL: trunc2x2i64_4i32: 1412; AVX: # %bb.0: # %entry 1413; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1414; AVX-NEXT: retq 1415; 1416; AVX512F-LABEL: trunc2x2i64_4i32: 1417; AVX512F: # %bb.0: # %entry 1418; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1419; AVX512F-NEXT: retq 1420; 1421; AVX512VL-LABEL: trunc2x2i64_4i32: 1422; AVX512VL: # %bb.0: # %entry 1423; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1424; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1425; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 1426; AVX512VL-NEXT: vzeroupper 1427; AVX512VL-NEXT: retq 1428; 1429; AVX512BW-LABEL: trunc2x2i64_4i32: 1430; AVX512BW: # %bb.0: # %entry 1431; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1432; AVX512BW-NEXT: retq 1433; 1434; AVX512BWVL-LABEL: trunc2x2i64_4i32: 1435; AVX512BWVL: # %bb.0: # %entry 1436; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1437; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1438; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 1439; AVX512BWVL-NEXT: vzeroupper 1440; AVX512BWVL-NEXT: retq 1441entry: 1442 %0 = trunc <2 x i64> %a to <2 x i32> 1443 %1 = trunc <2 x i64> %b to <2 x i32> 1444 %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1445 ret <4 x i32> %2 1446} 1447 1448define i64 @trunc2i64_i64(<2 x i64> %inval) { 1449; SSE-LABEL: trunc2i64_i64: 1450; SSE: # %bb.0: # %entry 1451; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1452; SSE-NEXT: movq %xmm0, %rax 1453; SSE-NEXT: retq 1454; 1455; AVX-LABEL: trunc2i64_i64: 1456; AVX: # %bb.0: # %entry 1457; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1458; AVX-NEXT: vmovq %xmm0, %rax 1459; AVX-NEXT: retq 1460; 1461; AVX512-LABEL: trunc2i64_i64: 1462; AVX512: # %bb.0: # %entry 1463; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1464; AVX512-NEXT: vmovq %xmm0, %rax 1465; AVX512-NEXT: retq 1466entry: 1467 %0 = trunc <2 x i64> %inval to <2 x i32> 1468 %1 = bitcast <2 x i32> %0 to i64 1469 ret i64 %1 1470} 1471 1472define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { 1473; SSE2-LABEL: trunc2x4i32_8i16: 1474; SSE2: # %bb.0: # %entry 1475; SSE2-NEXT: pslld $16, %xmm1 1476; SSE2-NEXT: psrad $16, %xmm1 1477; SSE2-NEXT: pslld $16, %xmm0 1478; SSE2-NEXT: psrad $16, %xmm0 1479; SSE2-NEXT: packssdw %xmm1, %xmm0 1480; SSE2-NEXT: retq 1481; 1482; SSSE3-LABEL: trunc2x4i32_8i16: 1483; SSSE3: # %bb.0: # %entry 1484; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1485; SSSE3-NEXT: pshufb %xmm2, %xmm1 1486; SSSE3-NEXT: pshufb %xmm2, %xmm0 1487; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1488; SSSE3-NEXT: retq 1489; 1490; SSE41-LABEL: trunc2x4i32_8i16: 1491; SSE41: # %bb.0: # %entry 1492; SSE41-NEXT: pxor %xmm2, %xmm2 1493; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 1494; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 1495; SSE41-NEXT: packusdw %xmm1, %xmm0 1496; SSE41-NEXT: retq 1497; 1498; AVX-LABEL: trunc2x4i32_8i16: 1499; AVX: # %bb.0: # %entry 1500; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 1501; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 1502; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 1503; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1504; AVX-NEXT: retq 1505; 1506; AVX512F-LABEL: trunc2x4i32_8i16: 1507; AVX512F: # %bb.0: # %entry 1508; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1509; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1510; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 1511; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1512; AVX512F-NEXT: vzeroupper 1513; AVX512F-NEXT: retq 1514; 1515; AVX512VL-LABEL: trunc2x4i32_8i16: 1516; AVX512VL: # %bb.0: # %entry 1517; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1518; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1519; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 1520; AVX512VL-NEXT: vzeroupper 1521; AVX512VL-NEXT: retq 1522; 1523; AVX512BW-LABEL: trunc2x4i32_8i16: 1524; AVX512BW: # %bb.0: # %entry 1525; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1526; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1527; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 1528; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1529; AVX512BW-NEXT: vzeroupper 1530; AVX512BW-NEXT: retq 1531; 1532; AVX512BWVL-LABEL: trunc2x4i32_8i16: 1533; AVX512BWVL: # %bb.0: # %entry 1534; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1535; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1536; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 1537; AVX512BWVL-NEXT: vzeroupper 1538; AVX512BWVL-NEXT: retq 1539entry: 1540 %0 = trunc <4 x i32> %a to <4 x i16> 1541 %1 = trunc <4 x i32> %b to <4 x i16> 1542 %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1543 ret <8 x i16> %2 1544} 1545 1546; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 1547define i64 @trunc4i32_i64(<4 x i32> %inval) { 1548; SSE2-LABEL: trunc4i32_i64: 1549; SSE2: # %bb.0: # %entry 1550; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1551; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 1552; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1553; SSE2-NEXT: movq %xmm0, %rax 1554; SSE2-NEXT: retq 1555; 1556; SSSE3-LABEL: trunc4i32_i64: 1557; SSSE3: # %bb.0: # %entry 1558; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1559; SSSE3-NEXT: movq %xmm0, %rax 1560; SSSE3-NEXT: retq 1561; 1562; SSE41-LABEL: trunc4i32_i64: 1563; SSE41: # %bb.0: # %entry 1564; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1565; SSE41-NEXT: movq %xmm0, %rax 1566; SSE41-NEXT: retq 1567; 1568; AVX-LABEL: trunc4i32_i64: 1569; AVX: # %bb.0: # %entry 1570; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1571; AVX-NEXT: vmovq %xmm0, %rax 1572; AVX-NEXT: retq 1573; 1574; AVX512F-LABEL: trunc4i32_i64: 1575; AVX512F: # %bb.0: # %entry 1576; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1577; AVX512F-NEXT: vmovq %xmm0, %rax 1578; AVX512F-NEXT: retq 1579; 1580; AVX512VL-LABEL: trunc4i32_i64: 1581; AVX512VL: # %bb.0: # %entry 1582; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 1583; AVX512VL-NEXT: vmovq %xmm0, %rax 1584; AVX512VL-NEXT: retq 1585; 1586; AVX512BW-LABEL: trunc4i32_i64: 1587; AVX512BW: # %bb.0: # %entry 1588; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1589; AVX512BW-NEXT: vmovq %xmm0, %rax 1590; AVX512BW-NEXT: retq 1591; 1592; AVX512BWVL-LABEL: trunc4i32_i64: 1593; AVX512BWVL: # %bb.0: # %entry 1594; AVX512BWVL-NEXT: vpmovdw %xmm0, %xmm0 1595; AVX512BWVL-NEXT: vmovq %xmm0, %rax 1596; AVX512BWVL-NEXT: retq 1597entry: 1598 %0 = trunc <4 x i32> %inval to <4 x i16> 1599 %1 = bitcast <4 x i16> %0 to i64 1600 ret i64 %1 1601} 1602 1603define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) { 1604; SSE2-SSSE3-LABEL: trunc2x16i16_32i8: 1605; SSE2-SSSE3: # %bb.0: # %entry 1606; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1607; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 1608; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 1609; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 1610; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 1611; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 1612; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4 1613; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 1614; SSE2-SSSE3-NEXT: retq 1615; 1616; SSE41-LABEL: trunc2x16i16_32i8: 1617; SSE41: # %bb.0: # %entry 1618; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1619; SSE41-NEXT: pand %xmm4, %xmm1 1620; SSE41-NEXT: pand %xmm4, %xmm0 1621; SSE41-NEXT: packuswb %xmm1, %xmm0 1622; SSE41-NEXT: pand %xmm4, %xmm3 1623; SSE41-NEXT: pand %xmm2, %xmm4 1624; SSE41-NEXT: packuswb %xmm3, %xmm4 1625; SSE41-NEXT: movdqa %xmm4, %xmm1 1626; SSE41-NEXT: retq 1627; 1628; AVX1-LABEL: trunc2x16i16_32i8: 1629; AVX1: # %bb.0: # %entry 1630; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1631; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1632; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1633; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1634; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 1635; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1636; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1637; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1638; AVX1-NEXT: retq 1639; 1640; AVX2-LABEL: trunc2x16i16_32i8: 1641; AVX2: # %bb.0: # %entry 1642; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1643; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1644; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1645; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1646; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1647; AVX2-NEXT: retq 1648; 1649; AVX512F-LABEL: trunc2x16i16_32i8: 1650; AVX512F: # %bb.0: # %entry 1651; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1652; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1653; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1654; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 1655; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1656; AVX512F-NEXT: retq 1657; 1658; AVX512VL-LABEL: trunc2x16i16_32i8: 1659; AVX512VL: # %bb.0: # %entry 1660; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1661; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1662; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1663; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 1664; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1665; AVX512VL-NEXT: retq 1666; 1667; AVX512BW-LABEL: trunc2x16i16_32i8: 1668; AVX512BW: # %bb.0: # %entry 1669; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1670; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1671; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1672; AVX512BW-NEXT: retq 1673; 1674; AVX512BWVL-LABEL: trunc2x16i16_32i8: 1675; AVX512BWVL: # %bb.0: # %entry 1676; AVX512BWVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1677; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1678; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1679; AVX512BWVL-NEXT: retq 1680entry: 1681 %0 = trunc <16 x i16> %a to <16 x i8> 1682 %1 = trunc <16 x i16> %b to <16 x i8> 1683 %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1684 ret <32 x i8> %2 1685} 1686 1687define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { 1688; SSE2-SSSE3-LABEL: trunc2x8i16_16i8: 1689; SSE2-SSSE3: # %bb.0: # %entry 1690; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1691; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 1692; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 1693; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 1694; SSE2-SSSE3-NEXT: retq 1695; 1696; SSE41-LABEL: trunc2x8i16_16i8: 1697; SSE41: # %bb.0: # %entry 1698; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1699; SSE41-NEXT: pand %xmm2, %xmm1 1700; SSE41-NEXT: pand %xmm2, %xmm0 1701; SSE41-NEXT: packuswb %xmm1, %xmm0 1702; SSE41-NEXT: retq 1703; 1704; AVX1-LABEL: trunc2x8i16_16i8: 1705; AVX1: # %bb.0: # %entry 1706; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1707; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1708; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1709; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1710; AVX1-NEXT: retq 1711; 1712; AVX2-LABEL: trunc2x8i16_16i8: 1713; AVX2: # %bb.0: # %entry 1714; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1715; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 1716; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1717; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1718; AVX2-NEXT: retq 1719; 1720; AVX512F-LABEL: trunc2x8i16_16i8: 1721; AVX512F: # %bb.0: # %entry 1722; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1723; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 1724; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 1725; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1726; AVX512F-NEXT: retq 1727; 1728; AVX512VL-LABEL: trunc2x8i16_16i8: 1729; AVX512VL: # %bb.0: # %entry 1730; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1731; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 1732; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 1733; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1734; AVX512VL-NEXT: retq 1735; 1736; AVX512BW-LABEL: trunc2x8i16_16i8: 1737; AVX512BW: # %bb.0: # %entry 1738; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1739; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1740; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1741; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1742; AVX512BW-NEXT: vzeroupper 1743; AVX512BW-NEXT: retq 1744; 1745; AVX512BWVL-LABEL: trunc2x8i16_16i8: 1746; AVX512BWVL: # %bb.0: # %entry 1747; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1748; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1749; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1750; AVX512BWVL-NEXT: vzeroupper 1751; AVX512BWVL-NEXT: retq 1752entry: 1753 %0 = trunc <8 x i16> %a to <8 x i8> 1754 %1 = trunc <8 x i16> %b to <8 x i8> 1755 %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1756 ret <16 x i8> %2 1757} 1758 1759; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 1760define i64 @trunc8i16_i64(<8 x i16> %inval) { 1761; SSE2-LABEL: trunc8i16_i64: 1762; SSE2: # %bb.0: # %entry 1763; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1764; SSE2-NEXT: packuswb %xmm0, %xmm0 1765; SSE2-NEXT: movq %xmm0, %rax 1766; SSE2-NEXT: retq 1767; 1768; SSSE3-LABEL: trunc8i16_i64: 1769; SSSE3: # %bb.0: # %entry 1770; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1771; SSSE3-NEXT: movq %xmm0, %rax 1772; SSSE3-NEXT: retq 1773; 1774; SSE41-LABEL: trunc8i16_i64: 1775; SSE41: # %bb.0: # %entry 1776; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1777; SSE41-NEXT: movq %xmm0, %rax 1778; SSE41-NEXT: retq 1779; 1780; AVX-LABEL: trunc8i16_i64: 1781; AVX: # %bb.0: # %entry 1782; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1783; AVX-NEXT: vmovq %xmm0, %rax 1784; AVX-NEXT: retq 1785; 1786; AVX512F-LABEL: trunc8i16_i64: 1787; AVX512F: # %bb.0: # %entry 1788; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1789; AVX512F-NEXT: vmovq %xmm0, %rax 1790; AVX512F-NEXT: retq 1791; 1792; AVX512VL-LABEL: trunc8i16_i64: 1793; AVX512VL: # %bb.0: # %entry 1794; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1795; AVX512VL-NEXT: vmovq %xmm0, %rax 1796; AVX512VL-NEXT: retq 1797; 1798; AVX512BW-LABEL: trunc8i16_i64: 1799; AVX512BW: # %bb.0: # %entry 1800; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1801; AVX512BW-NEXT: vmovq %xmm0, %rax 1802; AVX512BW-NEXT: retq 1803; 1804; AVX512BWVL-LABEL: trunc8i16_i64: 1805; AVX512BWVL: # %bb.0: # %entry 1806; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0 1807; AVX512BWVL-NEXT: vmovq %xmm0, %rax 1808; AVX512BWVL-NEXT: retq 1809entry: 1810 %0 = trunc <8 x i16> %inval to <8 x i8> 1811 %1 = bitcast <8 x i8> %0 to i64 1812 ret i64 %1 1813} 1814 1815define <16 x i8> @trunc16i64_16i8_const() { 1816; SSE-LABEL: trunc16i64_16i8_const: 1817; SSE: # %bb.0: # %entry 1818; SSE-NEXT: xorps %xmm0, %xmm0 1819; SSE-NEXT: retq 1820; 1821; AVX-LABEL: trunc16i64_16i8_const: 1822; AVX: # %bb.0: # %entry 1823; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1824; AVX-NEXT: retq 1825; 1826; AVX512-LABEL: trunc16i64_16i8_const: 1827; AVX512: # %bb.0: # %entry 1828; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 1829; AVX512-NEXT: retq 1830 1831entry: 1832 %0 = trunc <16 x i64> zeroinitializer to <16 x i8> 1833 %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26> 1834 ret <16 x i8> %1 1835} 1836 1837define <8 x i16> @PR32160(<8 x i32> %x) { 1838; SSE-LABEL: PR32160: 1839; SSE: # %bb.0: 1840; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 1841; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1842; SSE-NEXT: retq 1843; 1844; AVX-LABEL: PR32160: 1845; AVX: # %bb.0: 1846; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] 1847; AVX-NEXT: vzeroupper 1848; AVX-NEXT: retq 1849; 1850; AVX512F-LABEL: PR32160: 1851; AVX512F: # %bb.0: 1852; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1853; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 1854; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 1855; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 1856; AVX512F-NEXT: vzeroupper 1857; AVX512F-NEXT: retq 1858; 1859; AVX512VL-LABEL: PR32160: 1860; AVX512VL: # %bb.0: 1861; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 1862; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1863; AVX512VL-NEXT: vzeroupper 1864; AVX512VL-NEXT: retq 1865; 1866; AVX512BW-LABEL: PR32160: 1867; AVX512BW: # %bb.0: 1868; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1869; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 1870; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1871; AVX512BW-NEXT: vzeroupper 1872; AVX512BW-NEXT: retq 1873; 1874; AVX512BWVL-LABEL: PR32160: 1875; AVX512BWVL: # %bb.0: 1876; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 1877; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1878; AVX512BWVL-NEXT: vzeroupper 1879; AVX512BWVL-NEXT: retq 1880 %shuf = trunc <8 x i32> %x to <8 x i16> 1881 %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 1882 ret <8 x i16> %trunc 1883} 1884 1885define void @PR34773(ptr %a0, ptr %a1) { 1886; SSE-LABEL: PR34773: 1887; SSE: # %bb.0: 1888; SSE-NEXT: movdqu (%rdi), %xmm0 1889; SSE-NEXT: movdqu 16(%rdi), %xmm1 1890; SSE-NEXT: movdqu 32(%rdi), %xmm2 1891; SSE-NEXT: movdqu 48(%rdi), %xmm3 1892; SSE-NEXT: psrlw $8, %xmm1 1893; SSE-NEXT: psrlw $8, %xmm0 1894; SSE-NEXT: packuswb %xmm1, %xmm0 1895; SSE-NEXT: psrlw $8, %xmm3 1896; SSE-NEXT: psrlw $8, %xmm2 1897; SSE-NEXT: packuswb %xmm3, %xmm2 1898; SSE-NEXT: movdqu %xmm0, (%rsi) 1899; SSE-NEXT: movdqu %xmm2, 16(%rsi) 1900; SSE-NEXT: retq 1901; 1902; AVX1-LABEL: PR34773: 1903; AVX1: # %bb.0: 1904; AVX1-NEXT: vmovdqu (%rdi), %xmm0 1905; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 1906; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 1907; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 1908; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1909; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1910; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1911; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 1912; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1913; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 1914; AVX1-NEXT: vmovdqu %xmm0, (%rsi) 1915; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) 1916; AVX1-NEXT: retq 1917; 1918; AVX2-LABEL: PR34773: 1919; AVX2: # %bb.0: 1920; AVX2-NEXT: vmovdqu (%rdi), %ymm0 1921; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 1922; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1923; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1924; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1925; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1926; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1927; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1928; AVX2-NEXT: vmovdqu %xmm0, (%rsi) 1929; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) 1930; AVX2-NEXT: vzeroupper 1931; AVX2-NEXT: retq 1932; 1933; AVX512F-LABEL: PR34773: 1934; AVX512F: # %bb.0: 1935; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 1936; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 1937; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1938; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 1939; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1940; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) 1941; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1942; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi) 1943; AVX512F-NEXT: vzeroupper 1944; AVX512F-NEXT: retq 1945; 1946; AVX512VL-LABEL: PR34773: 1947; AVX512VL: # %bb.0: 1948; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 1949; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 1950; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1951; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 1952; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1953; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) 1954; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1955; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi) 1956; AVX512VL-NEXT: vzeroupper 1957; AVX512VL-NEXT: retq 1958; 1959; AVX512BW-LABEL: PR34773: 1960; AVX512BW: # %bb.0: 1961; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 1962; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1 1963; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 1964; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 1965; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1966; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 1967; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) 1968; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi) 1969; AVX512BW-NEXT: vzeroupper 1970; AVX512BW-NEXT: retq 1971; 1972; AVX512BWVL-LABEL: PR34773: 1973; AVX512BWVL: # %bb.0: 1974; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 1975; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1 1976; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) 1977; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi) 1978; AVX512BWVL-NEXT: vzeroupper 1979; AVX512BWVL-NEXT: retq 1980 %1 = getelementptr i16, ptr %a0, i64 16 1981 %2 = getelementptr i8, ptr %a1, i64 16 1982 %3 = load <16 x i16>, ptr %a0, align 2 1983 %4 = load <16 x i16>, ptr %1, align 2 1984 %5 = lshr <16 x i16> %3, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1985 %6 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1986 %7 = trunc <16 x i16> %5 to <16 x i8> 1987 %8 = trunc <16 x i16> %6 to <16 x i8> 1988 store <16 x i8> %7, ptr %a1, align 1 1989 store <16 x i8> %8, ptr %2, align 1 1990 ret void 1991} 1992 1993define i16 @PR66194(i8 %q) { 1994; SSE2-SSSE3-LABEL: PR66194: 1995; SSE2-SSSE3: # %bb.0: # %entry 1996; SSE2-SSSE3-NEXT: xorl %eax, %eax 1997; SSE2-SSSE3-NEXT: xorl %ecx, %ecx 1998; SSE2-SSSE3-NEXT: testb %dil, %dil 1999; SSE2-SSSE3-NEXT: setne %al 2000; SSE2-SSSE3-NEXT: sete %cl 2001; SSE2-SSSE3-NEXT: movl %ecx, %edx 2002; SSE2-SSSE3-NEXT: shll $16, %edx 2003; SSE2-SSSE3-NEXT: orl %eax, %edx 2004; SSE2-SSSE3-NEXT: movd %edx, %xmm0 2005; SSE2-SSSE3-NEXT: pinsrw $2, %eax, %xmm0 2006; SSE2-SSSE3-NEXT: pinsrw $3, %eax, %xmm0 2007; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 2008; SSE2-SSSE3-NEXT: pinsrw $5, %eax, %xmm0 2009; SSE2-SSSE3-NEXT: pinsrw $6, %eax, %xmm0 2010; SSE2-SSSE3-NEXT: pinsrw $7, %ecx, %xmm0 2011; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 2012; SSE2-SSSE3-NEXT: psubw %xmm1, %xmm0 2013; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 2014; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1 2015; SSE2-SSSE3-NEXT: psadbw %xmm0, %xmm1 2016; SSE2-SSSE3-NEXT: movd %xmm1, %eax 2017; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax 2018; SSE2-SSSE3-NEXT: retq 2019; 2020; SSE41-LABEL: PR66194: 2021; SSE41: # %bb.0: # %entry 2022; SSE41-NEXT: xorl %eax, %eax 2023; SSE41-NEXT: xorl %ecx, %ecx 2024; SSE41-NEXT: testb %dil, %dil 2025; SSE41-NEXT: setne %al 2026; SSE41-NEXT: sete %cl 2027; SSE41-NEXT: movd %eax, %xmm0 2028; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 2029; SSE41-NEXT: pinsrb $4, %eax, %xmm0 2030; SSE41-NEXT: pinsrb $6, %eax, %xmm0 2031; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 2032; SSE41-NEXT: pinsrb $10, %eax, %xmm0 2033; SSE41-NEXT: pinsrb $12, %eax, %xmm0 2034; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 2035; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 2036; SSE41-NEXT: psubw %xmm1, %xmm0 2037; SSE41-NEXT: packuswb %xmm0, %xmm0 2038; SSE41-NEXT: pxor %xmm1, %xmm1 2039; SSE41-NEXT: psadbw %xmm0, %xmm1 2040; SSE41-NEXT: movd %xmm1, %eax 2041; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 2042; SSE41-NEXT: retq 2043; 2044; AVX1-LABEL: PR66194: 2045; AVX1: # %bb.0: # %entry 2046; AVX1-NEXT: xorl %eax, %eax 2047; AVX1-NEXT: testb %dil, %dil 2048; AVX1-NEXT: setne %al 2049; AVX1-NEXT: sete %cl 2050; AVX1-NEXT: vmovd %eax, %xmm0 2051; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 2052; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 2053; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 2054; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 2055; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 2056; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 2057; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 2058; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2059; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 2060; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2061; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2062; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2063; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2064; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 2065; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2066; AVX1-NEXT: vmovd %xmm0, %eax 2067; AVX1-NEXT: # kill: def $ax killed $ax killed $eax 2068; AVX1-NEXT: retq 2069; 2070; AVX2-LABEL: PR66194: 2071; AVX2: # %bb.0: # %entry 2072; AVX2-NEXT: xorl %eax, %eax 2073; AVX2-NEXT: xorl %ecx, %ecx 2074; AVX2-NEXT: testb %dil, %dil 2075; AVX2-NEXT: setne %al 2076; AVX2-NEXT: sete %cl 2077; AVX2-NEXT: vmovd %eax, %xmm0 2078; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 2079; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 2080; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 2081; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 2082; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 2083; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 2084; AVX2-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 2085; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2086; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2087; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2088; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2089; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2090; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 2091; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2092; AVX2-NEXT: vmovd %xmm0, %eax 2093; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 2094; AVX2-NEXT: retq 2095; 2096; AVX512-LABEL: PR66194: 2097; AVX512: # %bb.0: # %entry 2098; AVX512-NEXT: xorl %eax, %eax 2099; AVX512-NEXT: xorl %ecx, %ecx 2100; AVX512-NEXT: testb %dil, %dil 2101; AVX512-NEXT: setne %al 2102; AVX512-NEXT: sete %cl 2103; AVX512-NEXT: vmovd %eax, %xmm0 2104; AVX512-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 2105; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 2106; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 2107; AVX512-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 2108; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 2109; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 2110; AVX512-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 2111; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2112; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2113; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2114; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2115; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2116; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 2117; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2118; AVX512-NEXT: vmovd %xmm0, %eax 2119; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 2120; AVX512-NEXT: retq 2121entry: 2122 %cmp12.i.13 = icmp ne i8 %q, 0 2123 %cond.i15.13 = zext i1 %cmp12.i.13 to i16 2124 %tobool.not.i.13 = icmp eq i8 %q, 0 2125 %cond18.i.13 = zext i1 %tobool.not.i.13 to i16 2126 %0 = insertelement <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i16 %cond.i15.13, i64 8 2127 %1 = insertelement <16 x i16> %0, i16 %cond18.i.13, i64 9 2128 %2 = insertelement <16 x i16> %1, i16 %cond.i15.13, i64 10 2129 %3 = insertelement <16 x i16> %2, i16 %cond.i15.13, i64 11 2130 %4 = insertelement <16 x i16> %3, i16 %cond18.i.13, i64 12 2131 %5 = insertelement <16 x i16> %4, i16 %cond.i15.13, i64 13 2132 %6 = insertelement <16 x i16> %5, i16 %cond.i15.13, i64 14 2133 %7 = insertelement <16 x i16> %6, i16 %cond18.i.13, i64 15 2134 %8 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7) 2135 ret i16 %8 2136} 2137declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) 2138 2139; Store merging must not infinitely fight store splitting. 2140 2141define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) align 2 { 2142; SSE2-LABEL: store_merge_split: 2143; SSE2: # %bb.0: 2144; SSE2-NEXT: pslld $16, %xmm1 2145; SSE2-NEXT: psrad $16, %xmm1 2146; SSE2-NEXT: pslld $16, %xmm0 2147; SSE2-NEXT: psrad $16, %xmm0 2148; SSE2-NEXT: packssdw %xmm1, %xmm0 2149; SSE2-NEXT: pslld $16, %xmm3 2150; SSE2-NEXT: psrad $16, %xmm3 2151; SSE2-NEXT: pslld $16, %xmm2 2152; SSE2-NEXT: psrad $16, %xmm2 2153; SSE2-NEXT: packssdw %xmm3, %xmm2 2154; SSE2-NEXT: shlq $4, %rdi 2155; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi) 2156; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi) 2157; SSE2-NEXT: retq 2158; 2159; SSSE3-LABEL: store_merge_split: 2160; SSSE3: # %bb.0: 2161; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2162; SSSE3-NEXT: pshufb %xmm4, %xmm1 2163; SSSE3-NEXT: pshufb %xmm4, %xmm0 2164; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2165; SSSE3-NEXT: pshufb %xmm4, %xmm3 2166; SSSE3-NEXT: pshufb %xmm4, %xmm2 2167; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 2168; SSSE3-NEXT: shlq $4, %rdi 2169; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi) 2170; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi) 2171; SSSE3-NEXT: retq 2172; 2173; SSE41-LABEL: store_merge_split: 2174; SSE41: # %bb.0: 2175; SSE41-NEXT: pxor %xmm4, %xmm4 2176; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] 2177; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] 2178; SSE41-NEXT: packusdw %xmm1, %xmm0 2179; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] 2180; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] 2181; SSE41-NEXT: packusdw %xmm3, %xmm2 2182; SSE41-NEXT: shlq $4, %rdi 2183; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi) 2184; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi) 2185; SSE41-NEXT: retq 2186; 2187; AVX1-LABEL: store_merge_split: 2188; AVX1: # %bb.0: 2189; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 2190; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2191; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2192; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2193; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2194; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2195; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2196; AVX1-NEXT: shlq $4, %rdi 2197; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2198; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2199; AVX1-NEXT: vzeroupper 2200; AVX1-NEXT: retq 2201; 2202; AVX2-LABEL: store_merge_split: 2203; AVX2: # %bb.0: 2204; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2205; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2206; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2207; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2208; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2209; AVX2-NEXT: shlq $4, %rdi 2210; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2211; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2212; AVX2-NEXT: vzeroupper 2213; AVX2-NEXT: retq 2214; 2215; AVX512F-LABEL: store_merge_split: 2216; AVX512F: # %bb.0: 2217; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2218; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2219; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 2220; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 2221; AVX512F-NEXT: shlq $4, %rdi 2222; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2223; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2224; AVX512F-NEXT: vzeroupper 2225; AVX512F-NEXT: retq 2226; 2227; AVX512VL-LABEL: store_merge_split: 2228; AVX512VL: # %bb.0: 2229; AVX512VL-NEXT: shlq $4, %rdi 2230; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) 2231; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) 2232; AVX512VL-NEXT: vzeroupper 2233; AVX512VL-NEXT: retq 2234; 2235; AVX512BW-LABEL: store_merge_split: 2236; AVX512BW: # %bb.0: 2237; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2238; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2239; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 2240; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 2241; AVX512BW-NEXT: shlq $4, %rdi 2242; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2243; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2244; AVX512BW-NEXT: vzeroupper 2245; AVX512BW-NEXT: retq 2246; 2247; AVX512BWVL-LABEL: store_merge_split: 2248; AVX512BWVL: # %bb.0: 2249; AVX512BWVL-NEXT: shlq $4, %rdi 2250; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) 2251; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) 2252; AVX512BWVL-NEXT: vzeroupper 2253; AVX512BWVL-NEXT: retq 2254 %t1 = trunc <8 x i32> %w1 to <8 x i16> 2255 %t2 = trunc <8 x i32> %w2 to <8 x i16> 2256 %g1 = getelementptr inbounds <8 x i16>, ptr %p, i64 %idx 2257 %g2 = getelementptr inbounds <8 x i16>, ptr %g1, i64 1 2258 store <8 x i16> %t1, ptr %g1, align 2 2259 store <8 x i16> %t2, ptr %g2, align 2 2260 ret void 2261} 2262