; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX

;
; PACKUS saturation truncation to vXi32
;

define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
; SSE2-SSSE3-LABEL: trunc_packus_v2i64_v2i32:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4
; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-SSSE3-NEXT: por %xmm0, %xmm3
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v2i64_v2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v2i64_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v2i64_v2i32:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusqd %xmm0, %xmm0
; SKX-NEXT: retq
  %1 = icmp slt <2 x i64> %a0, <i64 4294967295, i64 4294967295>
  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>
  %3 = icmp sgt <2 x i64> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
  %5 = trunc <2 x i64> %4 to <2 x i32>
  ret <2 x i32> %5
}

define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE2-SSSE3-LABEL: trunc_packus_v2i64_v2i32_store:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4
; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-SSSE3-NEXT: por %xmm0, %xmm3
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-SSSE3-NEXT: movq %xmm0, (%rdi)
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i32_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusqd %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi)
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v2i64_v2i32_store:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusqd %xmm0, (%rdi)
; SKX-NEXT: retq
  %1 = icmp slt <2 x i64> %a0, <i64 4294967295, i64 4294967295>
  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>
  %3 = icmp sgt <2 x i64> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
  %5 = trunc <2 x i64> %4 to <2 x i32>
  store <2 x i32> %5, ptr %p1
  ret void
}

define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-SSSE3-LABEL: trunc_packus_v4i64_v4i32:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6
; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647]
; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8
; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0
; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5
; SSE2-SSSE3-NEXT: por %xmm5, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5
; SSE2-SSSE3-NEXT: por %xmm1, %xmm5
; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i64_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pxor %xmm3, %xmm5
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm6, %xmm7
; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm3, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm6
; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movapd %xmm5, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_packus_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_packus_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i64_v4i32:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpmovusqd %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %3 = icmp sgt <4 x i64> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}


define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i32:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3
; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm6
; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm11, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2
; SSE2-SSSE3-NEXT: por %xmm3, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm11, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8
; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3
; SSE2-SSSE3-NEXT: por %xmm8, %xmm3
; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm11, %xmm8
; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6
; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8
; SSE2-SSSE3-NEXT: por %xmm6, %xmm8
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm6, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5
; SSE2-SSSE3-NEXT: por %xmm1, %xmm5
; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm1, %xmm4
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4
; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm6, %xmm1
; SSE2-SSSE3-NEXT: pand %xmm8, %xmm1
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4
; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm5
; SSE41-NEXT: movdqa 16(%rdi), %xmm8
; SSE41-NEXT: movdqa 32(%rdi), %xmm7
; SSE41-NEXT: movdqa 48(%rdi), %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm5, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm8, %xmm5
; SSE41-NEXT: pxor %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pxor %xmm3, %xmm8
; SSE41-NEXT: movdqa %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: xorpd %xmm3, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: xorpd %xmm3, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: xorpd %xmm3, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm7
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i64_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-FAST-ALL-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0
; AVX512-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i64_v8i32:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm1
; SKX-NEXT: vpmovusqd %ymm1, %xmm1
; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm0
; SKX-NEXT: vpmovusqd %ymm0, %xmm0
; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; SKX-NEXT: retq
  %a0 = load <8 x i64>, ptr %p0
  %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %3 = icmp sgt <8 x i64> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  %5 = trunc <8 x i64> %4 to <8 x i32>
  ret <8 x i32> %5
}

;
; PACKUS saturation truncation to vXi16
;

define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
; SSE2-SSSE3-LABEL: trunc_packus_v2i64_v2i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4
; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-SSSE3-NEXT: por %xmm0, %xmm3
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v2i64_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 =
[65535,65535] 852; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 853; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 854; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 855; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 856; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 857; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 858; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 859; AVX1-NEXT: retq 860; 861; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16: 862; AVX2-SLOW: # %bb.0: 863; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] 864; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 865; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 866; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 867; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 868; AVX2-SLOW-NEXT: vpand %xmm0, %xmm1, %xmm0 869; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 870; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 871; AVX2-SLOW-NEXT: retq 872; 873; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16: 874; AVX2-FAST: # %bb.0: 875; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] 876; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 877; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 878; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 879; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 880; AVX2-FAST-NEXT: vpand %xmm0, %xmm1, %xmm0 881; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] 882; AVX2-FAST-NEXT: retq 883; 884; AVX512F-LABEL: trunc_packus_v2i64_v2i16: 885; AVX512F: # %bb.0: 886; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 887; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 888; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 889; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 890; AVX512F-NEXT: vzeroupper 891; AVX512F-NEXT: retq 892; 893; AVX512VL-LABEL: trunc_packus_v2i64_v2i16: 894; AVX512VL: # %bb.0: 895; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 896; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 897; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 898; AVX512VL-NEXT: retq 899; 
900; AVX512BW-LABEL: trunc_packus_v2i64_v2i16: 901; AVX512BW: # %bb.0: 902; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 903; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 904; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 905; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 906; AVX512BW-NEXT: vzeroupper 907; AVX512BW-NEXT: retq 908; 909; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16: 910; AVX512BWVL: # %bb.0: 911; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 912; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 913; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 914; AVX512BWVL-NEXT: retq 915; 916; SKX-LABEL: trunc_packus_v2i64_v2i16: 917; SKX: # %bb.0: 918; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 919; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 920; SKX-NEXT: vpmovusqw %xmm0, %xmm0 921; SKX-NEXT: retq 922 %1 = icmp slt <2 x i64> %a0, <i64 65535, i64 65535> 923 %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 65535, i64 65535> 924 %3 = icmp sgt <2 x i64> %2, zeroinitializer 925 %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer 926 %5 = trunc <2 x i64> %4 to <2 x i16> 927 ret <2 x i16> %5 928} 929 930define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { 931; SSE2-SSSE3-LABEL: trunc_packus_v2i64_v2i16_store: 932; SSE2-SSSE3: # %bb.0: 933; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 934; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 935; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 936; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 937; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 938; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 939; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] 940; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 941; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 942; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 943; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 944; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 945; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 946; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 947; SSE2-SSSE3-NEXT: por %xmm0, 
%xmm3 948; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 949; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 950; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 951; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 952; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 953; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 954; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 955; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 956; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 957; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 958; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 959; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 960; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 961; SSE2-SSSE3-NEXT: movd %xmm0, (%rdi) 962; SSE2-SSSE3-NEXT: retq 963; 964; SSE41-LABEL: trunc_packus_v2i64_v2i16_store: 965; SSE41: # %bb.0: 966; SSE41-NEXT: movdqa %xmm0, %xmm1 967; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] 968; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 969; SSE41-NEXT: pxor %xmm3, %xmm0 970; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] 971; SSE41-NEXT: movdqa %xmm0, %xmm5 972; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 973; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 974; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 975; SSE41-NEXT: pand %xmm5, %xmm0 976; SSE41-NEXT: por %xmm4, %xmm0 977; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 978; SSE41-NEXT: xorpd %xmm1, %xmm1 979; SSE41-NEXT: movapd %xmm2, %xmm4 980; SSE41-NEXT: xorpd %xmm3, %xmm4 981; SSE41-NEXT: movapd %xmm4, %xmm5 982; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 983; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 984; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 985; SSE41-NEXT: pand %xmm5, %xmm0 986; SSE41-NEXT: por %xmm4, %xmm0 987; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 988; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 989; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 990; SSE41-NEXT: movd %xmm0, (%rdi) 991; SSE41-NEXT: retq 992; 993; AVX1-LABEL: trunc_packus_v2i64_v2i16_store: 994; AVX1: # %bb.0: 995; AVX1-NEXT: vpmovzxwq {{.*#+}} 
xmm1 = [65535,65535] 996; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 997; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 998; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 999; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1000; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 1001; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1002; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1003; AVX1-NEXT: vmovd %xmm0, (%rdi) 1004; AVX1-NEXT: retq 1005; 1006; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store: 1007; AVX2-SLOW: # %bb.0: 1008; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] 1009; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1010; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1011; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1012; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1013; AVX2-SLOW-NEXT: vpand %xmm0, %xmm1, %xmm0 1014; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1015; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1016; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) 1017; AVX2-SLOW-NEXT: retq 1018; 1019; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store: 1020; AVX2-FAST: # %bb.0: 1021; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] 1022; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1023; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1024; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 1025; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1026; AVX2-FAST-NEXT: vpand %xmm0, %xmm1, %xmm0 1027; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] 1028; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi) 1029; AVX2-FAST-NEXT: retq 1030; 1031; AVX512F-LABEL: trunc_packus_v2i64_v2i16_store: 1032; AVX512F: # %bb.0: 1033; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1034; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1035; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1036; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1037; AVX512F-NEXT: vmovd %xmm0, (%rdi) 1038; AVX512F-NEXT: vzeroupper 1039; AVX512F-NEXT: retq 1040; 1041; AVX512VL-LABEL: 
trunc_packus_v2i64_v2i16_store: 1042; AVX512VL: # %bb.0: 1043; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1044; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1045; AVX512VL-NEXT: vpmovusqw %xmm0, (%rdi) 1046; AVX512VL-NEXT: retq 1047; 1048; AVX512BW-LABEL: trunc_packus_v2i64_v2i16_store: 1049; AVX512BW: # %bb.0: 1050; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1051; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1052; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1053; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1054; AVX512BW-NEXT: vmovd %xmm0, (%rdi) 1055; AVX512BW-NEXT: vzeroupper 1056; AVX512BW-NEXT: retq 1057; 1058; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16_store: 1059; AVX512BWVL: # %bb.0: 1060; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1061; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1062; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) 1063; AVX512BWVL-NEXT: retq 1064; 1065; SKX-LABEL: trunc_packus_v2i64_v2i16_store: 1066; SKX: # %bb.0: 1067; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1068; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1069; SKX-NEXT: vpmovusqw %xmm0, (%rdi) 1070; SKX-NEXT: retq 1071 %1 = icmp slt <2 x i64> %a0, <i64 65535, i64 65535> 1072 %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 65535, i64 65535> 1073 %3 = icmp sgt <2 x i64> %2, zeroinitializer 1074 %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer 1075 %5 = trunc <2 x i64> %4 to <2 x i16> 1076 store <2 x i16> %5, ptr%p1 1077 ret void 1078} 1079 1080define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { 1081; SSE2-SSSE3-LABEL: trunc_packus_v4i64_v4i16: 1082; SSE2-SSSE3: # %bb.0: 1083; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] 1084; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 1085; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 1086; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm5 1087; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] 1088; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2 1089; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 1090; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = 
[2147549183,2147549183] 1091; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 1092; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 1093; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,0,2,2] 1094; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 1095; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] 1096; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 1097; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 1098; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 1099; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 1100; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 1101; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm5 1102; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] 1103; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 1104; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 1105; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] 1106; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 1107; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] 1108; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 1109; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 1110; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 1111; SSE2-SSSE3-NEXT: por %xmm1, %xmm6 1112; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1 1113; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 1114; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 1115; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 1116; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 1117; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1118; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 1119; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1120; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 1121; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 1122; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 1123; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 1124; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 1125; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 1126; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 1127; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1128; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 1129; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 1130; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 1131; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 1132; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 1133; SSE2-SSSE3-NEXT: pslld 
$16, %xmm0 1134; SSE2-SSSE3-NEXT: psrad $16, %xmm0 1135; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 1136; SSE2-SSSE3-NEXT: retq 1137; 1138; SSE41-LABEL: trunc_packus_v4i64_v4i16: 1139; SSE41: # %bb.0: 1140; SSE41-NEXT: movdqa %xmm0, %xmm2 1141; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] 1142; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 1143; SSE41-NEXT: movdqa %xmm0, %xmm5 1144; SSE41-NEXT: pxor %xmm3, %xmm5 1145; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] 1146; SSE41-NEXT: movdqa %xmm6, %xmm7 1147; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 1148; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 1149; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 1150; SSE41-NEXT: pand %xmm5, %xmm0 1151; SSE41-NEXT: por %xmm7, %xmm0 1152; SSE41-NEXT: movapd %xmm4, %xmm5 1153; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 1154; SSE41-NEXT: movdqa %xmm1, %xmm0 1155; SSE41-NEXT: pxor %xmm3, %xmm0 1156; SSE41-NEXT: movdqa %xmm0, %xmm2 1157; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 1158; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 1159; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1160; SSE41-NEXT: pand %xmm2, %xmm0 1161; SSE41-NEXT: por %xmm6, %xmm0 1162; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 1163; SSE41-NEXT: xorpd %xmm1, %xmm1 1164; SSE41-NEXT: movapd %xmm4, %xmm2 1165; SSE41-NEXT: xorpd %xmm3, %xmm2 1166; SSE41-NEXT: movapd %xmm2, %xmm6 1167; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1168; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 1169; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 1170; SSE41-NEXT: pand %xmm6, %xmm0 1171; SSE41-NEXT: por %xmm2, %xmm0 1172; SSE41-NEXT: pxor %xmm2, %xmm2 1173; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 1174; SSE41-NEXT: movapd %xmm5, %xmm4 1175; SSE41-NEXT: xorpd %xmm3, %xmm4 1176; SSE41-NEXT: movapd %xmm4, %xmm6 1177; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1178; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 1179; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1180; SSE41-NEXT: pand %xmm6, %xmm0 1181; SSE41-NEXT: por %xmm4, %xmm0 1182; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 1183; 
SSE41-NEXT: packusdw %xmm2, %xmm1 1184; SSE41-NEXT: packusdw %xmm1, %xmm1 1185; SSE41-NEXT: movdqa %xmm1, %xmm0 1186; SSE41-NEXT: retq 1187; 1188; AVX1-LABEL: trunc_packus_v4i64_v4i16: 1189; AVX1: # %bb.0: 1190; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] 1191; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1192; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 1193; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1194; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 1195; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 1196; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1197; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 1198; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 1199; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 1200; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1201; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 1202; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1203; AVX1-NEXT: vzeroupper 1204; AVX1-NEXT: retq 1205; 1206; AVX2-LABEL: trunc_packus_v4i64_v4i16: 1207; AVX2: # %bb.0: 1208; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] 1209; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 1210; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 1211; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1212; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 1213; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 1214; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1215; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1216; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1217; AVX2-NEXT: vzeroupper 1218; AVX2-NEXT: retq 1219; 1220; AVX512F-LABEL: trunc_packus_v4i64_v4i16: 1221; AVX512F: # %bb.0: 1222; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1223; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1224; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1225; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1226; AVX512F-NEXT: vzeroupper 1227; AVX512F-NEXT: retq 1228; 1229; AVX512VL-LABEL: trunc_packus_v4i64_v4i16: 1230; AVX512VL: # %bb.0: 1231; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1232; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1233; AVX512VL-NEXT: vpmovusqw %ymm0, %xmm0 1234; 
AVX512VL-NEXT: vzeroupper 1235; AVX512VL-NEXT: retq 1236; 1237; AVX512BW-LABEL: trunc_packus_v4i64_v4i16: 1238; AVX512BW: # %bb.0: 1239; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1240; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1241; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1242; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1243; AVX512BW-NEXT: vzeroupper 1244; AVX512BW-NEXT: retq 1245; 1246; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i16: 1247; AVX512BWVL: # %bb.0: 1248; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1249; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1250; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 1251; AVX512BWVL-NEXT: vzeroupper 1252; AVX512BWVL-NEXT: retq 1253; 1254; SKX-LABEL: trunc_packus_v4i64_v4i16: 1255; SKX: # %bb.0: 1256; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1257; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1258; SKX-NEXT: vpmovusqw %ymm0, %xmm0 1259; SKX-NEXT: vzeroupper 1260; SKX-NEXT: retq 1261 %1 = icmp slt <4 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535> 1262 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535> 1263 %3 = icmp sgt <4 x i64> %2, zeroinitializer 1264 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 1265 %5 = trunc <4 x i64> %4 to <4 x i16> 1266 ret <4 x i16> %5 1267} 1268 1269define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { 1270; SSE2-SSSE3-LABEL: trunc_packus_v4i64_v4i16_store: 1271; SSE2-SSSE3: # %bb.0: 1272; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] 1273; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 1274; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 1275; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 1276; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 1277; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 1278; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 1279; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] 1280; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 1281; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 1282; SSE2-SSSE3-NEXT: pshufd 
{{.*#+}} xmm9 = xmm8[0,0,2,2] 1283; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 1284; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] 1285; SSE2-SSSE3-NEXT: por %xmm9, %xmm3 1286; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 1287; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 1288; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 1289; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 1290; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 1291; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 1292; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 1293; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 1294; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 1295; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 1296; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 1297; SSE2-SSSE3-NEXT: por %xmm0, %xmm5 1298; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 1299; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 1300; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 1301; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 1302; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 1303; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 1304; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 1305; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 1306; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1307; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 1308; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1309; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 1310; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 1311; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 1312; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 1313; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 1314; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 1315; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 1316; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1317; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 1318; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1319; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 1320; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 1321; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 1322; SSE2-SSSE3-NEXT: pslld $16, %xmm2 1323; SSE2-SSSE3-NEXT: psrad $16, %xmm2 1324; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm2 1325; SSE2-SSSE3-NEXT: movq %xmm2, (%rdi) 
1326; SSE2-SSSE3-NEXT: retq 1327; 1328; SSE41-LABEL: trunc_packus_v4i64_v4i16_store: 1329; SSE41: # %bb.0: 1330; SSE41-NEXT: movdqa %xmm0, %xmm2 1331; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] 1332; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 1333; SSE41-NEXT: movdqa %xmm0, %xmm5 1334; SSE41-NEXT: pxor %xmm3, %xmm5 1335; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] 1336; SSE41-NEXT: movdqa %xmm6, %xmm7 1337; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 1338; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 1339; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 1340; SSE41-NEXT: pand %xmm5, %xmm0 1341; SSE41-NEXT: por %xmm7, %xmm0 1342; SSE41-NEXT: movapd %xmm4, %xmm5 1343; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 1344; SSE41-NEXT: movdqa %xmm1, %xmm0 1345; SSE41-NEXT: pxor %xmm3, %xmm0 1346; SSE41-NEXT: movdqa %xmm0, %xmm2 1347; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 1348; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 1349; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1350; SSE41-NEXT: pand %xmm2, %xmm0 1351; SSE41-NEXT: por %xmm6, %xmm0 1352; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 1353; SSE41-NEXT: xorpd %xmm1, %xmm1 1354; SSE41-NEXT: movapd %xmm4, %xmm2 1355; SSE41-NEXT: xorpd %xmm3, %xmm2 1356; SSE41-NEXT: movapd %xmm2, %xmm6 1357; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1358; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 1359; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 1360; SSE41-NEXT: pand %xmm6, %xmm0 1361; SSE41-NEXT: por %xmm2, %xmm0 1362; SSE41-NEXT: pxor %xmm2, %xmm2 1363; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 1364; SSE41-NEXT: movapd %xmm5, %xmm4 1365; SSE41-NEXT: xorpd %xmm3, %xmm4 1366; SSE41-NEXT: movapd %xmm4, %xmm6 1367; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1368; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 1369; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1370; SSE41-NEXT: pand %xmm6, %xmm0 1371; SSE41-NEXT: por %xmm4, %xmm0 1372; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 1373; SSE41-NEXT: packusdw %xmm2, %xmm1 1374; SSE41-NEXT: packusdw %xmm1, %xmm1 1375; SSE41-NEXT: 
movq %xmm1, (%rdi) 1376; SSE41-NEXT: retq 1377; 1378; AVX1-LABEL: trunc_packus_v4i64_v4i16_store: 1379; AVX1: # %bb.0: 1380; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] 1381; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1382; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 1383; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1384; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 1385; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 1386; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1387; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 1388; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 1389; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 1390; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1391; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 1392; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1393; AVX1-NEXT: vmovq %xmm0, (%rdi) 1394; AVX1-NEXT: vzeroupper 1395; AVX1-NEXT: retq 1396; 1397; AVX2-LABEL: trunc_packus_v4i64_v4i16_store: 1398; AVX2: # %bb.0: 1399; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] 1400; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 1401; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 1402; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1403; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 1404; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 1405; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1406; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1407; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1408; AVX2-NEXT: vmovq %xmm0, (%rdi) 1409; AVX2-NEXT: vzeroupper 1410; AVX2-NEXT: retq 1411; 1412; AVX512F-LABEL: trunc_packus_v4i64_v4i16_store: 1413; AVX512F: # %bb.0: 1414; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1415; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1416; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1417; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1418; AVX512F-NEXT: vmovq %xmm0, (%rdi) 1419; AVX512F-NEXT: vzeroupper 1420; AVX512F-NEXT: retq 1421; 1422; AVX512VL-LABEL: trunc_packus_v4i64_v4i16_store: 1423; AVX512VL: # %bb.0: 1424; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1425; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1426; 
AVX512VL-NEXT: vpmovusqw %ymm0, (%rdi) 1427; AVX512VL-NEXT: vzeroupper 1428; AVX512VL-NEXT: retq 1429; 1430; AVX512BW-LABEL: trunc_packus_v4i64_v4i16_store: 1431; AVX512BW: # %bb.0: 1432; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1433; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1434; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1435; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1436; AVX512BW-NEXT: vmovq %xmm0, (%rdi) 1437; AVX512BW-NEXT: vzeroupper 1438; AVX512BW-NEXT: retq 1439; 1440; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i16_store: 1441; AVX512BWVL: # %bb.0: 1442; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1443; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1444; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) 1445; AVX512BWVL-NEXT: vzeroupper 1446; AVX512BWVL-NEXT: retq 1447; 1448; SKX-LABEL: trunc_packus_v4i64_v4i16_store: 1449; SKX: # %bb.0: 1450; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1451; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1452; SKX-NEXT: vpmovusqw %ymm0, (%rdi) 1453; SKX-NEXT: vzeroupper 1454; SKX-NEXT: retq 1455 %1 = icmp slt <4 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535> 1456 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535> 1457 %3 = icmp sgt <4 x i64> %2, zeroinitializer 1458 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 1459 %5 = trunc <4 x i64> %4 to <4 x i16> 1460 store <4 x i16> %5, ptr%p1 1461 ret void 1462} 1463 1464define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" { 1465; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i16: 1466; SSE2-SSSE3: # %bb.0: 1467; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 1468; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 1469; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 1470; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8 1471; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] 1472; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 1473; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 1474; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 1475; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] 1476; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 1477; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 1478; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] 1479; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 1480; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 1481; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 1482; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 1483; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] 1484; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 1485; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 1486; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 1487; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 1488; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 1489; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 1490; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] 1491; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 1492; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 1493; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 1494; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 1495; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 1496; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] 1497; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 1498; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8 1499; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 1500; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 1501; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 1502; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 1503; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] 1504; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 1505; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 1506; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 1507; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 1508; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 1509; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] 1510; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 1511; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 1512; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 1513; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 1514; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 1515; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 1516; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 1517; 
SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 1518; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 1519; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] 1520; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 1521; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 1522; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 1523; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 1524; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 1525; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 1526; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 1527; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 1528; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 1529; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 1530; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 1531; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1532; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 1533; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1534; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 1535; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 1536; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 1537; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 1538; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 1539; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 1540; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 1541; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 1542; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 1543; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] 1544; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 1545; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 1546; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 1547; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 1548; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 1549; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 1550; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 1551; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 1552; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1553; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 1554; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1555; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 1556; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 1557; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 1558; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 1559; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 1560; SSE2-SSSE3-NEXT: 
pcmpgtd %xmm1, %xmm4 1561; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 1562; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 1563; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 1564; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] 1565; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 1566; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 1567; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2] 1568; SSE2-SSSE3-NEXT: pslld $16, %xmm3 1569; SSE2-SSSE3-NEXT: psrad $16, %xmm3 1570; SSE2-SSSE3-NEXT: pslld $16, %xmm0 1571; SSE2-SSSE3-NEXT: psrad $16, %xmm0 1572; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0 1573; SSE2-SSSE3-NEXT: retq 1574; 1575; SSE41-LABEL: trunc_packus_v8i64_v8i16: 1576; SSE41: # %bb.0: 1577; SSE41-NEXT: movdqa (%rdi), %xmm7 1578; SSE41-NEXT: movdqa 16(%rdi), %xmm5 1579; SSE41-NEXT: movdqa 32(%rdi), %xmm4 1580; SSE41-NEXT: movdqa 48(%rdi), %xmm8 1581; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] 1582; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] 1583; SSE41-NEXT: movdqa %xmm4, %xmm3 1584; SSE41-NEXT: pxor %xmm2, %xmm3 1585; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] 1586; SSE41-NEXT: movdqa %xmm6, %xmm9 1587; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 1588; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 1589; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 1590; SSE41-NEXT: pand %xmm3, %xmm0 1591; SSE41-NEXT: por %xmm9, %xmm0 1592; SSE41-NEXT: movapd %xmm1, %xmm3 1593; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 1594; SSE41-NEXT: movdqa %xmm8, %xmm4 1595; SSE41-NEXT: pxor %xmm2, %xmm4 1596; SSE41-NEXT: movdqa %xmm6, %xmm9 1597; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 1598; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 1599; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 1600; SSE41-NEXT: pand %xmm4, %xmm0 1601; SSE41-NEXT: por %xmm9, %xmm0 1602; SSE41-NEXT: movapd %xmm1, %xmm4 1603; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 1604; SSE41-NEXT: movdqa %xmm7, %xmm8 1605; SSE41-NEXT: pxor %xmm2, %xmm8 1606; SSE41-NEXT: movdqa %xmm6, %xmm9 1607; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 1608; 
SSE41-NEXT: pcmpeqd %xmm6, %xmm8 1609; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 1610; SSE41-NEXT: pand %xmm8, %xmm0 1611; SSE41-NEXT: por %xmm9, %xmm0 1612; SSE41-NEXT: movapd %xmm1, %xmm8 1613; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 1614; SSE41-NEXT: movdqa %xmm5, %xmm0 1615; SSE41-NEXT: pxor %xmm2, %xmm0 1616; SSE41-NEXT: movdqa %xmm0, %xmm7 1617; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 1618; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 1619; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1620; SSE41-NEXT: pand %xmm7, %xmm0 1621; SSE41-NEXT: por %xmm6, %xmm0 1622; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 1623; SSE41-NEXT: xorpd %xmm5, %xmm5 1624; SSE41-NEXT: movapd %xmm1, %xmm6 1625; SSE41-NEXT: xorpd %xmm2, %xmm6 1626; SSE41-NEXT: movapd %xmm6, %xmm7 1627; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 1628; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 1629; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1630; SSE41-NEXT: pand %xmm7, %xmm0 1631; SSE41-NEXT: por %xmm6, %xmm0 1632; SSE41-NEXT: pxor %xmm6, %xmm6 1633; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 1634; SSE41-NEXT: movapd %xmm8, %xmm1 1635; SSE41-NEXT: xorpd %xmm2, %xmm1 1636; SSE41-NEXT: movapd %xmm1, %xmm7 1637; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 1638; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 1639; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 1640; SSE41-NEXT: pand %xmm7, %xmm0 1641; SSE41-NEXT: por %xmm1, %xmm0 1642; SSE41-NEXT: pxor %xmm1, %xmm1 1643; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 1644; SSE41-NEXT: packusdw %xmm6, %xmm1 1645; SSE41-NEXT: movapd %xmm4, %xmm6 1646; SSE41-NEXT: xorpd %xmm2, %xmm6 1647; SSE41-NEXT: movapd %xmm6, %xmm7 1648; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 1649; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 1650; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1651; SSE41-NEXT: pand %xmm7, %xmm0 1652; SSE41-NEXT: por %xmm6, %xmm0 1653; SSE41-NEXT: pxor %xmm6, %xmm6 1654; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 1655; SSE41-NEXT: movapd %xmm3, %xmm4 1656; SSE41-NEXT: xorpd %xmm2, %xmm4 1657; SSE41-NEXT: movapd %xmm4, %xmm7 
1658; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 1659; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 1660; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1661; SSE41-NEXT: pand %xmm7, %xmm0 1662; SSE41-NEXT: por %xmm4, %xmm0 1663; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 1664; SSE41-NEXT: packusdw %xmm6, %xmm5 1665; SSE41-NEXT: packusdw %xmm5, %xmm1 1666; SSE41-NEXT: movdqa %xmm1, %xmm0 1667; SSE41-NEXT: retq 1668; 1669; AVX1-LABEL: trunc_packus_v8i64_v8i16: 1670; AVX1: # %bb.0: 1671; AVX1-NEXT: vmovdqa (%rdi), %xmm0 1672; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 1673; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 1674; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 1675; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = [65535,65535] 1676; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 1677; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 1678; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 1679; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 1680; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 1681; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 1682; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 1683; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 1684; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 1685; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 1686; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 1687; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 1688; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 1689; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1690; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 1691; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1692; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 1693; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 1694; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1695; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1696; AVX1-NEXT: retq 1697; 1698; AVX2-LABEL: trunc_packus_v8i64_v8i16: 1699; AVX2: # %bb.0: 1700; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1701; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1702; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535] 1703; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 1704; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 1705; AVX2-NEXT: vpcmpgtq %ymm1, 
%ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0
; AVX512-NEXT: vpmovusqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i64_v8i16:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1
; SKX-NEXT: vpmovusqw %ymm1, %xmm1
; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0
; SKX-NEXT: vpmovusqw %ymm0, %xmm0
; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %a0 = load <8 x i64>, ptr %p0
  %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
  %3 = icmp sgt <8 x i64> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  %5 = trunc <8 x i64> %4 to <8 x i16>
  ret <8 x i16> %5
}

define <4 x i16> @trunc_packus_v4i32_v4i16(<4 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v4i32_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i32_v4i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i32_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v4i32_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v4i32_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i16:
; SKX: # %bb.0:
; SKX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; SKX-NEXT: retq
  %1 = icmp slt <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
  %3 = icmp sgt <4 x i32> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = trunc <4 x i32> %4 to <4 x i16>
  ret <4 x i16> %5
}

define void @trunc_packus_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
; SSE2-LABEL: trunc_packus_v4i32_v4i16_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i32_v4i16_store:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i32_v4i16_store:
; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm0, %xmm0
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusdw %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi)
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i16_store:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusdw %xmm0, (%rdi)
; SKX-NEXT: retq
  %1 = icmp slt <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
  %3 = icmp sgt <4 x i32> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, ptr%p1
  ret void
}

define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v8i32_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
;
SSE2-NEXT: pand %xmm3, %xmm2 1915; SSE2-NEXT: pslld $16, %xmm2 1916; SSE2-NEXT: psrad $16, %xmm2 1917; SSE2-NEXT: pslld $16, %xmm0 1918; SSE2-NEXT: psrad $16, %xmm0 1919; SSE2-NEXT: packssdw %xmm2, %xmm0 1920; SSE2-NEXT: retq 1921; 1922; SSSE3-LABEL: trunc_packus_v8i32_v8i16: 1923; SSSE3: # %bb.0: 1924; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] 1925; SSSE3-NEXT: movdqa %xmm2, %xmm3 1926; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 1927; SSSE3-NEXT: pand %xmm3, %xmm1 1928; SSSE3-NEXT: pandn %xmm2, %xmm3 1929; SSSE3-NEXT: por %xmm1, %xmm3 1930; SSSE3-NEXT: movdqa %xmm2, %xmm1 1931; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 1932; SSSE3-NEXT: pand %xmm1, %xmm0 1933; SSSE3-NEXT: pandn %xmm2, %xmm1 1934; SSSE3-NEXT: por %xmm1, %xmm0 1935; SSSE3-NEXT: pxor %xmm1, %xmm1 1936; SSSE3-NEXT: movdqa %xmm0, %xmm2 1937; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 1938; SSSE3-NEXT: pand %xmm2, %xmm0 1939; SSSE3-NEXT: movdqa %xmm3, %xmm2 1940; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 1941; SSSE3-NEXT: pand %xmm3, %xmm2 1942; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1943; SSSE3-NEXT: pshufb %xmm1, %xmm2 1944; SSSE3-NEXT: pshufb %xmm1, %xmm0 1945; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1946; SSSE3-NEXT: retq 1947; 1948; SSE41-LABEL: trunc_packus_v8i32_v8i16: 1949; SSE41: # %bb.0: 1950; SSE41-NEXT: packusdw %xmm1, %xmm0 1951; SSE41-NEXT: retq 1952; 1953; AVX1-LABEL: trunc_packus_v8i32_v8i16: 1954; AVX1: # %bb.0: 1955; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1956; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1957; AVX1-NEXT: vzeroupper 1958; AVX1-NEXT: retq 1959; 1960; AVX2-LABEL: trunc_packus_v8i32_v8i16: 1961; AVX2: # %bb.0: 1962; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1963; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1964; AVX2-NEXT: vzeroupper 1965; AVX2-NEXT: retq 1966; 1967; AVX512F-LABEL: trunc_packus_v8i32_v8i16: 1968; AVX512F: # %bb.0: 1969; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 1970; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1971; 
AVX512F-NEXT: vzeroupper 1972; AVX512F-NEXT: retq 1973; 1974; AVX512VL-LABEL: trunc_packus_v8i32_v8i16: 1975; AVX512VL: # %bb.0: 1976; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1977; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 1978; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 1979; AVX512VL-NEXT: vzeroupper 1980; AVX512VL-NEXT: retq 1981; 1982; AVX512BW-LABEL: trunc_packus_v8i32_v8i16: 1983; AVX512BW: # %bb.0: 1984; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1985; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1986; AVX512BW-NEXT: vzeroupper 1987; AVX512BW-NEXT: retq 1988; 1989; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16: 1990; AVX512BWVL: # %bb.0: 1991; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1992; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 1993; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 1994; AVX512BWVL-NEXT: vzeroupper 1995; AVX512BWVL-NEXT: retq 1996; 1997; SKX-LABEL: trunc_packus_v8i32_v8i16: 1998; SKX: # %bb.0: 1999; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2000; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 2001; SKX-NEXT: vpmovusdw %ymm0, %xmm0 2002; SKX-NEXT: vzeroupper 2003; SKX-NEXT: retq 2004 %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2005 %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2006 %3 = icmp sgt <8 x i32> %2, zeroinitializer 2007 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 2008 %5 = trunc <8 x i32> %4 to <8 x i16> 2009 ret <8 x i16> %5 2010} 2011 2012define <16 x i16> @trunc_packus_v16i32_v16i16(ptr %p0) "min-legal-vector-width"="256" { 2013; SSE2-LABEL: trunc_packus_v16i32_v16i16: 2014; SSE2: # %bb.0: 2015; SSE2-NEXT: movdqa (%rdi), %xmm1 2016; SSE2-NEXT: movdqa 16(%rdi), %xmm3 2017; SSE2-NEXT: movdqa 32(%rdi), %xmm0 2018; SSE2-NEXT: movdqa 48(%rdi), %xmm4 2019; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] 2020; SSE2-NEXT: movdqa %xmm5, %xmm2 2021; SSE2-NEXT: 
pcmpgtd %xmm3, %xmm2 2022; SSE2-NEXT: pand %xmm2, %xmm3 2023; SSE2-NEXT: pandn %xmm5, %xmm2 2024; SSE2-NEXT: por %xmm3, %xmm2 2025; SSE2-NEXT: movdqa %xmm5, %xmm3 2026; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 2027; SSE2-NEXT: pand %xmm3, %xmm1 2028; SSE2-NEXT: pandn %xmm5, %xmm3 2029; SSE2-NEXT: por %xmm1, %xmm3 2030; SSE2-NEXT: movdqa %xmm5, %xmm6 2031; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 2032; SSE2-NEXT: pand %xmm6, %xmm4 2033; SSE2-NEXT: pandn %xmm5, %xmm6 2034; SSE2-NEXT: por %xmm4, %xmm6 2035; SSE2-NEXT: movdqa %xmm5, %xmm4 2036; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 2037; SSE2-NEXT: pand %xmm4, %xmm0 2038; SSE2-NEXT: pandn %xmm5, %xmm4 2039; SSE2-NEXT: por %xmm0, %xmm4 2040; SSE2-NEXT: pxor %xmm5, %xmm5 2041; SSE2-NEXT: movdqa %xmm4, %xmm1 2042; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 2043; SSE2-NEXT: pand %xmm4, %xmm1 2044; SSE2-NEXT: movdqa %xmm6, %xmm4 2045; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 2046; SSE2-NEXT: pand %xmm6, %xmm4 2047; SSE2-NEXT: movdqa %xmm3, %xmm0 2048; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 2049; SSE2-NEXT: pand %xmm3, %xmm0 2050; SSE2-NEXT: movdqa %xmm2, %xmm3 2051; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 2052; SSE2-NEXT: pand %xmm2, %xmm3 2053; SSE2-NEXT: pslld $16, %xmm3 2054; SSE2-NEXT: psrad $16, %xmm3 2055; SSE2-NEXT: pslld $16, %xmm0 2056; SSE2-NEXT: psrad $16, %xmm0 2057; SSE2-NEXT: packssdw %xmm3, %xmm0 2058; SSE2-NEXT: pslld $16, %xmm4 2059; SSE2-NEXT: psrad $16, %xmm4 2060; SSE2-NEXT: pslld $16, %xmm1 2061; SSE2-NEXT: psrad $16, %xmm1 2062; SSE2-NEXT: packssdw %xmm4, %xmm1 2063; SSE2-NEXT: retq 2064; 2065; SSSE3-LABEL: trunc_packus_v16i32_v16i16: 2066; SSSE3: # %bb.0: 2067; SSSE3-NEXT: movdqa (%rdi), %xmm1 2068; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 2069; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 2070; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 2071; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] 2072; SSSE3-NEXT: movdqa %xmm5, %xmm2 2073; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 2074; SSSE3-NEXT: pand %xmm2, %xmm3 2075; SSSE3-NEXT: pandn %xmm5, %xmm2 2076; SSSE3-NEXT: por %xmm3, 
%xmm2 2077; SSSE3-NEXT: movdqa %xmm5, %xmm3 2078; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 2079; SSSE3-NEXT: pand %xmm3, %xmm1 2080; SSSE3-NEXT: pandn %xmm5, %xmm3 2081; SSSE3-NEXT: por %xmm1, %xmm3 2082; SSSE3-NEXT: movdqa %xmm5, %xmm6 2083; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 2084; SSSE3-NEXT: pand %xmm6, %xmm4 2085; SSSE3-NEXT: pandn %xmm5, %xmm6 2086; SSSE3-NEXT: por %xmm4, %xmm6 2087; SSSE3-NEXT: movdqa %xmm5, %xmm4 2088; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 2089; SSSE3-NEXT: pand %xmm4, %xmm0 2090; SSSE3-NEXT: pandn %xmm5, %xmm4 2091; SSSE3-NEXT: por %xmm0, %xmm4 2092; SSSE3-NEXT: pxor %xmm5, %xmm5 2093; SSSE3-NEXT: movdqa %xmm4, %xmm1 2094; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 2095; SSSE3-NEXT: pand %xmm4, %xmm1 2096; SSSE3-NEXT: movdqa %xmm6, %xmm4 2097; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 2098; SSSE3-NEXT: pand %xmm6, %xmm4 2099; SSSE3-NEXT: movdqa %xmm3, %xmm0 2100; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 2101; SSSE3-NEXT: pand %xmm3, %xmm0 2102; SSSE3-NEXT: movdqa %xmm2, %xmm3 2103; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 2104; SSSE3-NEXT: pand %xmm2, %xmm3 2105; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2106; SSSE3-NEXT: pshufb %xmm2, %xmm3 2107; SSSE3-NEXT: pshufb %xmm2, %xmm0 2108; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] 2109; SSSE3-NEXT: pshufb %xmm2, %xmm4 2110; SSSE3-NEXT: pshufb %xmm2, %xmm1 2111; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] 2112; SSSE3-NEXT: retq 2113; 2114; SSE41-LABEL: trunc_packus_v16i32_v16i16: 2115; SSE41: # %bb.0: 2116; SSE41-NEXT: movdqa (%rdi), %xmm0 2117; SSE41-NEXT: movdqa 32(%rdi), %xmm1 2118; SSE41-NEXT: packusdw 16(%rdi), %xmm0 2119; SSE41-NEXT: packusdw 48(%rdi), %xmm1 2120; SSE41-NEXT: retq 2121; 2122; AVX1-LABEL: trunc_packus_v16i32_v16i16: 2123; AVX1: # %bb.0: 2124; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2125; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 2126; AVX1-NEXT: vpackusdw 48(%rdi), %xmm1, %xmm1 2127; AVX1-NEXT: vpackusdw 16(%rdi), %xmm0, %xmm0 2128; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 
2129; AVX1-NEXT: retq 2130; 2131; AVX2-LABEL: trunc_packus_v16i32_v16i16: 2132; AVX2: # %bb.0: 2133; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2134; AVX2-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 2135; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2136; AVX2-NEXT: retq 2137; 2138; AVX512-LABEL: trunc_packus_v16i32_v16i16: 2139; AVX512: # %bb.0: 2140; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 2141; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0 2142; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 2143; AVX512-NEXT: retq 2144; 2145; SKX-LABEL: trunc_packus_v16i32_v16i16: 2146; SKX: # %bb.0: 2147; SKX-NEXT: vmovdqa (%rdi), %ymm0 2148; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 2149; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2150; SKX-NEXT: retq 2151 %a0 = load <16 x i32>, ptr %p0 2152 %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2153 %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2154 %3 = icmp sgt <16 x i32> %2, zeroinitializer 2155 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 2156 %5 = trunc <16 x i32> %4 to <16 x i16> 2157 ret <16 x i16> %5 2158} 2159 2160; 2161; PACKUS saturation truncation to vXi8 2162; 2163 2164define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { 2165; SSE2-LABEL: trunc_packus_v2i64_v2i8: 2166; SSE2: # %bb.0: 2167; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 2168; SSE2-NEXT: movdqa %xmm0, %xmm2 2169; SSE2-NEXT: pxor %xmm1, %xmm2 2170; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 2171; SSE2-NEXT: pxor %xmm4, %xmm4 2172; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 2173; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 2174; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 2175; SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm3[0,0,2,2] 2176; SSE2-NEXT: pand %xmm4, %xmm2 2177; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2178; SSE2-NEXT: por %xmm2, %xmm3 2179; SSE2-NEXT: pand %xmm3, %xmm0 2180; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2181; SSE2-NEXT: por %xmm3, %xmm0 2182; SSE2-NEXT: movdqa %xmm0, %xmm2 2183; SSE2-NEXT: pxor %xmm1, %xmm2 2184; SSE2-NEXT: movdqa %xmm2, %xmm3 2185; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 2186; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] 2187; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 2188; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 2189; SSE2-NEXT: pand %xmm4, %xmm1 2190; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 2191; SSE2-NEXT: por %xmm1, %xmm2 2192; SSE2-NEXT: pand %xmm2, %xmm0 2193; SSE2-NEXT: packuswb %xmm0, %xmm0 2194; SSE2-NEXT: packuswb %xmm0, %xmm0 2195; SSE2-NEXT: packuswb %xmm0, %xmm0 2196; SSE2-NEXT: retq 2197; 2198; SSSE3-LABEL: trunc_packus_v2i64_v2i8: 2199; SSSE3: # %bb.0: 2200; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 2201; SSSE3-NEXT: movdqa %xmm0, %xmm2 2202; SSSE3-NEXT: pxor %xmm1, %xmm2 2203; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 2204; SSSE3-NEXT: pxor %xmm4, %xmm4 2205; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 2206; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 2207; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 2208; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 2209; SSSE3-NEXT: pand %xmm4, %xmm2 2210; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2211; SSSE3-NEXT: por %xmm2, %xmm3 2212; SSSE3-NEXT: pand %xmm3, %xmm0 2213; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2214; SSSE3-NEXT: por %xmm3, %xmm0 2215; SSSE3-NEXT: movdqa %xmm0, %xmm2 2216; SSSE3-NEXT: pxor %xmm1, %xmm2 2217; SSSE3-NEXT: movdqa %xmm2, %xmm3 2218; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 2219; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] 2220; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 2221; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 2222; SSSE3-NEXT: pand %xmm4, %xmm1 2223; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[1,1,3,3] 2224; SSSE3-NEXT: por %xmm1, %xmm2 2225; SSSE3-NEXT: pand %xmm2, %xmm0 2226; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2227; SSSE3-NEXT: retq 2228; 2229; SSE41-LABEL: trunc_packus_v2i64_v2i8: 2230; SSE41: # %bb.0: 2231; SSE41-NEXT: movdqa %xmm0, %xmm1 2232; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] 2233; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 2234; SSE41-NEXT: pxor %xmm3, %xmm0 2235; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] 2236; SSE41-NEXT: movdqa %xmm0, %xmm5 2237; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 2238; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 2239; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2240; SSE41-NEXT: pand %xmm5, %xmm0 2241; SSE41-NEXT: por %xmm4, %xmm0 2242; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 2243; SSE41-NEXT: xorpd %xmm1, %xmm1 2244; SSE41-NEXT: movapd %xmm2, %xmm4 2245; SSE41-NEXT: xorpd %xmm3, %xmm4 2246; SSE41-NEXT: movapd %xmm4, %xmm5 2247; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 2248; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 2249; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2250; SSE41-NEXT: pand %xmm5, %xmm0 2251; SSE41-NEXT: por %xmm4, %xmm0 2252; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 2253; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2254; SSE41-NEXT: movdqa %xmm1, %xmm0 2255; SSE41-NEXT: retq 2256; 2257; AVX-LABEL: trunc_packus_v2i64_v2i8: 2258; AVX: # %bb.0: 2259; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] 2260; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 2261; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 2262; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2263; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 2264; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 2265; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2266; AVX-NEXT: retq 2267; 2268; AVX512F-LABEL: trunc_packus_v2i64_v2i8: 2269; AVX512F: # %bb.0: 2270; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2271; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2272; AVX512F-NEXT: 
vpmaxsq %zmm1, %zmm0, %zmm0 2273; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 2274; AVX512F-NEXT: vzeroupper 2275; AVX512F-NEXT: retq 2276; 2277; AVX512VL-LABEL: trunc_packus_v2i64_v2i8: 2278; AVX512VL: # %bb.0: 2279; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2280; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 2281; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 2282; AVX512VL-NEXT: retq 2283; 2284; AVX512BW-LABEL: trunc_packus_v2i64_v2i8: 2285; AVX512BW: # %bb.0: 2286; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2287; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2288; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2289; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 2290; AVX512BW-NEXT: vzeroupper 2291; AVX512BW-NEXT: retq 2292; 2293; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8: 2294; AVX512BWVL: # %bb.0: 2295; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2296; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 2297; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 2298; AVX512BWVL-NEXT: retq 2299; 2300; SKX-LABEL: trunc_packus_v2i64_v2i8: 2301; SKX: # %bb.0: 2302; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2303; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 2304; SKX-NEXT: vpmovusqb %xmm0, %xmm0 2305; SKX-NEXT: retq 2306 %1 = icmp slt <2 x i64> %a0, <i64 255, i64 255> 2307 %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 255, i64 255> 2308 %3 = icmp sgt <2 x i64> %2, zeroinitializer 2309 %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer 2310 %5 = trunc <2 x i64> %4 to <2 x i8> 2311 ret <2 x i8> %5 2312} 2313 2314define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { 2315; SSE2-LABEL: trunc_packus_v2i64_v2i8_store: 2316; SSE2: # %bb.0: 2317; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 2318; SSE2-NEXT: movdqa %xmm0, %xmm2 2319; SSE2-NEXT: pxor %xmm1, %xmm2 2320; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 2321; SSE2-NEXT: pxor %xmm4, %xmm4 2322; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 2323; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 2324; SSE2-NEXT: pcmpgtd %xmm2, 
%xmm3 2325; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 2326; SSE2-NEXT: pand %xmm4, %xmm2 2327; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2328; SSE2-NEXT: por %xmm2, %xmm3 2329; SSE2-NEXT: pand %xmm3, %xmm0 2330; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2331; SSE2-NEXT: por %xmm0, %xmm3 2332; SSE2-NEXT: movdqa %xmm3, %xmm0 2333; SSE2-NEXT: pxor %xmm1, %xmm0 2334; SSE2-NEXT: movdqa %xmm0, %xmm2 2335; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 2336; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 2337; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 2338; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2339; SSE2-NEXT: pand %xmm4, %xmm0 2340; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 2341; SSE2-NEXT: por %xmm0, %xmm1 2342; SSE2-NEXT: pand %xmm3, %xmm1 2343; SSE2-NEXT: packuswb %xmm1, %xmm1 2344; SSE2-NEXT: packuswb %xmm1, %xmm1 2345; SSE2-NEXT: packuswb %xmm1, %xmm1 2346; SSE2-NEXT: movd %xmm1, %eax 2347; SSE2-NEXT: movw %ax, (%rdi) 2348; SSE2-NEXT: retq 2349; 2350; SSSE3-LABEL: trunc_packus_v2i64_v2i8_store: 2351; SSSE3: # %bb.0: 2352; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 2353; SSSE3-NEXT: movdqa %xmm0, %xmm2 2354; SSSE3-NEXT: pxor %xmm1, %xmm2 2355; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 2356; SSSE3-NEXT: pxor %xmm4, %xmm4 2357; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 2358; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 2359; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 2360; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 2361; SSSE3-NEXT: pand %xmm4, %xmm2 2362; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2363; SSSE3-NEXT: por %xmm2, %xmm3 2364; SSSE3-NEXT: pand %xmm3, %xmm0 2365; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2366; SSSE3-NEXT: por %xmm0, %xmm3 2367; SSSE3-NEXT: movdqa %xmm3, %xmm0 2368; SSSE3-NEXT: pxor %xmm1, %xmm0 2369; SSSE3-NEXT: movdqa %xmm0, %xmm2 2370; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 2371; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 2372; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 2373; SSSE3-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2374; SSSE3-NEXT: pand %xmm4, %xmm0 2375; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 2376; SSSE3-NEXT: por %xmm0, %xmm1 2377; SSSE3-NEXT: pand %xmm3, %xmm1 2378; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2379; SSSE3-NEXT: movd %xmm1, %eax 2380; SSSE3-NEXT: movw %ax, (%rdi) 2381; SSSE3-NEXT: retq 2382; 2383; SSE41-LABEL: trunc_packus_v2i64_v2i8_store: 2384; SSE41: # %bb.0: 2385; SSE41-NEXT: movdqa %xmm0, %xmm1 2386; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] 2387; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 2388; SSE41-NEXT: pxor %xmm3, %xmm0 2389; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] 2390; SSE41-NEXT: movdqa %xmm0, %xmm5 2391; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 2392; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 2393; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2394; SSE41-NEXT: pand %xmm5, %xmm0 2395; SSE41-NEXT: por %xmm4, %xmm0 2396; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 2397; SSE41-NEXT: xorpd %xmm1, %xmm1 2398; SSE41-NEXT: movapd %xmm2, %xmm4 2399; SSE41-NEXT: xorpd %xmm3, %xmm4 2400; SSE41-NEXT: movapd %xmm4, %xmm5 2401; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 2402; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 2403; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2404; SSE41-NEXT: pand %xmm5, %xmm0 2405; SSE41-NEXT: por %xmm4, %xmm0 2406; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 2407; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2408; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) 2409; SSE41-NEXT: retq 2410; 2411; AVX-LABEL: trunc_packus_v2i64_v2i8_store: 2412; AVX: # %bb.0: 2413; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] 2414; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 2415; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 2416; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2417; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 2418; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 2419; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2420; AVX-NEXT: vpextrw 
$0, %xmm0, (%rdi) 2421; AVX-NEXT: retq 2422; 2423; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: 2424; AVX512F: # %bb.0: 2425; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2426; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2427; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2428; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 2429; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) 2430; AVX512F-NEXT: vzeroupper 2431; AVX512F-NEXT: retq 2432; 2433; AVX512VL-LABEL: trunc_packus_v2i64_v2i8_store: 2434; AVX512VL: # %bb.0: 2435; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2436; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 2437; AVX512VL-NEXT: vpmovusqb %xmm0, (%rdi) 2438; AVX512VL-NEXT: retq 2439; 2440; AVX512BW-LABEL: trunc_packus_v2i64_v2i8_store: 2441; AVX512BW: # %bb.0: 2442; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2443; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2444; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2445; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 2446; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdi) 2447; AVX512BW-NEXT: vzeroupper 2448; AVX512BW-NEXT: retq 2449; 2450; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8_store: 2451; AVX512BWVL: # %bb.0: 2452; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2453; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 2454; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) 2455; AVX512BWVL-NEXT: retq 2456; 2457; SKX-LABEL: trunc_packus_v2i64_v2i8_store: 2458; SKX: # %bb.0: 2459; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2460; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 2461; SKX-NEXT: vpmovusqb %xmm0, (%rdi) 2462; SKX-NEXT: retq 2463 %1 = icmp slt <2 x i64> %a0, <i64 255, i64 255> 2464 %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 255, i64 255> 2465 %3 = icmp sgt <2 x i64> %2, zeroinitializer 2466 %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer 2467 %5 = trunc <2 x i64> %4 to <2 x i8> 2468 store <2 x i8> %5, ptr%p1 2469 ret void 2470} 2471 2472define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { 2473; SSE2-SSSE3-LABEL: 
trunc_packus_v4i64_v4i8: 2474; SSE2-SSSE3: # %bb.0: 2475; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255] 2476; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 2477; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 2478; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 2479; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 2480; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 2481; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 2482; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] 2483; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 2484; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 2485; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] 2486; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 2487; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] 2488; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 2489; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 2490; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 2491; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 2492; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 2493; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 2494; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 2495; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 2496; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 2497; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] 2498; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 2499; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 2500; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 2501; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 2502; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 2503; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 2504; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 2505; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 2506; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 2507; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 2508; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] 2509; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 2510; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2511; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 2512; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2513; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 2514; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 2515; SSE2-SSSE3-NEXT: movdqa %xmm0, 
%xmm1 2516; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 2517; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 2518; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 2519; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 2520; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 2521; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2522; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 2523; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 2524; SSE2-SSSE3-NEXT: por %xmm1, %xmm2 2525; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 2526; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm0 2527; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 2528; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 2529; SSE2-SSSE3-NEXT: retq 2530; 2531; SSE41-LABEL: trunc_packus_v4i64_v4i8: 2532; SSE41: # %bb.0: 2533; SSE41-NEXT: movdqa %xmm0, %xmm2 2534; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] 2535; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 2536; SSE41-NEXT: movdqa %xmm0, %xmm5 2537; SSE41-NEXT: pxor %xmm3, %xmm5 2538; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] 2539; SSE41-NEXT: movdqa %xmm6, %xmm7 2540; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 2541; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 2542; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 2543; SSE41-NEXT: pand %xmm5, %xmm0 2544; SSE41-NEXT: por %xmm7, %xmm0 2545; SSE41-NEXT: movapd %xmm4, %xmm5 2546; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 2547; SSE41-NEXT: movdqa %xmm1, %xmm0 2548; SSE41-NEXT: pxor %xmm3, %xmm0 2549; SSE41-NEXT: movdqa %xmm0, %xmm2 2550; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 2551; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 2552; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 2553; SSE41-NEXT: pand %xmm2, %xmm0 2554; SSE41-NEXT: por %xmm6, %xmm0 2555; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 2556; SSE41-NEXT: xorpd %xmm1, %xmm1 2557; SSE41-NEXT: movapd %xmm4, %xmm2 2558; SSE41-NEXT: xorpd %xmm3, %xmm2 2559; SSE41-NEXT: movapd %xmm2, %xmm6 2560; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 2561; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 2562; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 2563; 
SSE41-NEXT: pand %xmm6, %xmm0 2564; SSE41-NEXT: por %xmm2, %xmm0 2565; SSE41-NEXT: pxor %xmm2, %xmm2 2566; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 2567; SSE41-NEXT: movapd %xmm5, %xmm4 2568; SSE41-NEXT: xorpd %xmm3, %xmm4 2569; SSE41-NEXT: movapd %xmm4, %xmm6 2570; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 2571; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 2572; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2573; SSE41-NEXT: pand %xmm6, %xmm0 2574; SSE41-NEXT: por %xmm4, %xmm0 2575; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 2576; SSE41-NEXT: packusdw %xmm2, %xmm1 2577; SSE41-NEXT: packusdw %xmm1, %xmm1 2578; SSE41-NEXT: packuswb %xmm1, %xmm1 2579; SSE41-NEXT: movdqa %xmm1, %xmm0 2580; SSE41-NEXT: retq 2581; 2582; AVX1-LABEL: trunc_packus_v4i64_v4i8: 2583; AVX1: # %bb.0: 2584; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] 2585; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 2586; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 2587; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2588; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 2589; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 2590; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2591; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 2592; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 2593; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 2594; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2595; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 2596; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2597; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 2598; AVX1-NEXT: vzeroupper 2599; AVX1-NEXT: retq 2600; 2601; AVX2-LABEL: trunc_packus_v4i64_v4i8: 2602; AVX2: # %bb.0: 2603; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] 2604; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 2605; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 2606; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2607; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 2608; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 2609; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2610; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2611; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2612; AVX2-NEXT: 
vpackuswb %xmm0, %xmm0, %xmm0 2613; AVX2-NEXT: vzeroupper 2614; AVX2-NEXT: retq 2615; 2616; AVX512F-LABEL: trunc_packus_v4i64_v4i8: 2617; AVX512F: # %bb.0: 2618; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2619; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2620; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2621; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 2622; AVX512F-NEXT: vzeroupper 2623; AVX512F-NEXT: retq 2624; 2625; AVX512VL-LABEL: trunc_packus_v4i64_v4i8: 2626; AVX512VL: # %bb.0: 2627; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2628; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 2629; AVX512VL-NEXT: vpmovusqb %ymm0, %xmm0 2630; AVX512VL-NEXT: vzeroupper 2631; AVX512VL-NEXT: retq 2632; 2633; AVX512BW-LABEL: trunc_packus_v4i64_v4i8: 2634; AVX512BW: # %bb.0: 2635; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2636; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2637; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2638; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 2639; AVX512BW-NEXT: vzeroupper 2640; AVX512BW-NEXT: retq 2641; 2642; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i8: 2643; AVX512BWVL: # %bb.0: 2644; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2645; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 2646; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 2647; AVX512BWVL-NEXT: vzeroupper 2648; AVX512BWVL-NEXT: retq 2649; 2650; SKX-LABEL: trunc_packus_v4i64_v4i8: 2651; SKX: # %bb.0: 2652; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2653; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 2654; SKX-NEXT: vpmovusqb %ymm0, %xmm0 2655; SKX-NEXT: vzeroupper 2656; SKX-NEXT: retq 2657 %1 = icmp slt <4 x i64> %a0, <i64 255, i64 255, i64 255, i64 255> 2658 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 255, i64 255, i64 255, i64 255> 2659 %3 = icmp sgt <4 x i64> %2, zeroinitializer 2660 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 2661 %5 = trunc <4 x i64> %4 to <4 x i8> 2662 ret <4 x i8> %5 2663} 2664 2665define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { 2666; 
SSE2-SSSE3-LABEL: trunc_packus_v4i64_v4i8_store: 2667; SSE2-SSSE3: # %bb.0: 2668; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] 2669; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 2670; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 2671; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 2672; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2673; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 2674; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 2675; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] 2676; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 2677; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 2678; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] 2679; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 2680; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] 2681; SSE2-SSSE3-NEXT: por %xmm9, %xmm3 2682; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 2683; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 2684; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 2685; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 2686; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 2687; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 2688; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 2689; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 2690; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 2691; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 2692; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 2693; SSE2-SSSE3-NEXT: por %xmm0, %xmm5 2694; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 2695; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 2696; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 2697; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 2698; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 2699; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 2700; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 2701; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] 2702; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 2703; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2704; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 2705; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2706; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 2707; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 2708; 
SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 2709; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 2710; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 2711; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 2712; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 2713; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 2714; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2715; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 2716; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 2717; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 2718; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 2719; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm2 2720; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm2 2721; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm2 2722; SSE2-SSSE3-NEXT: movd %xmm2, (%rdi) 2723; SSE2-SSSE3-NEXT: retq 2724; 2725; SSE41-LABEL: trunc_packus_v4i64_v4i8_store: 2726; SSE41: # %bb.0: 2727; SSE41-NEXT: movdqa %xmm0, %xmm2 2728; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] 2729; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] 2730; SSE41-NEXT: movdqa %xmm0, %xmm5 2731; SSE41-NEXT: pxor %xmm3, %xmm5 2732; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] 2733; SSE41-NEXT: movdqa %xmm6, %xmm7 2734; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 2735; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 2736; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 2737; SSE41-NEXT: pand %xmm5, %xmm0 2738; SSE41-NEXT: por %xmm7, %xmm0 2739; SSE41-NEXT: movapd %xmm4, %xmm5 2740; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 2741; SSE41-NEXT: movdqa %xmm1, %xmm0 2742; SSE41-NEXT: pxor %xmm3, %xmm0 2743; SSE41-NEXT: movdqa %xmm0, %xmm2 2744; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 2745; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 2746; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 2747; SSE41-NEXT: pand %xmm2, %xmm0 2748; SSE41-NEXT: por %xmm6, %xmm0 2749; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 2750; SSE41-NEXT: xorpd %xmm1, %xmm1 2751; SSE41-NEXT: movapd %xmm4, %xmm2 2752; SSE41-NEXT: xorpd %xmm3, %xmm2 2753; SSE41-NEXT: movapd %xmm2, %xmm6 2754; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 2755; SSE41-NEXT: pcmpgtd 
%xmm3, %xmm2 2756; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 2757; SSE41-NEXT: pand %xmm6, %xmm0 2758; SSE41-NEXT: por %xmm2, %xmm0 2759; SSE41-NEXT: pxor %xmm2, %xmm2 2760; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 2761; SSE41-NEXT: movapd %xmm5, %xmm4 2762; SSE41-NEXT: xorpd %xmm3, %xmm4 2763; SSE41-NEXT: movapd %xmm4, %xmm6 2764; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 2765; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 2766; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2767; SSE41-NEXT: pand %xmm6, %xmm0 2768; SSE41-NEXT: por %xmm4, %xmm0 2769; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 2770; SSE41-NEXT: packusdw %xmm2, %xmm1 2771; SSE41-NEXT: packusdw %xmm1, %xmm1 2772; SSE41-NEXT: packuswb %xmm1, %xmm1 2773; SSE41-NEXT: movd %xmm1, (%rdi) 2774; SSE41-NEXT: retq 2775; 2776; AVX1-LABEL: trunc_packus_v4i64_v4i8_store: 2777; AVX1: # %bb.0: 2778; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] 2779; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 2780; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 2781; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2782; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 2783; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 2784; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2785; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 2786; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 2787; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 2788; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2789; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 2790; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2791; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 2792; AVX1-NEXT: vmovd %xmm0, (%rdi) 2793; AVX1-NEXT: vzeroupper 2794; AVX1-NEXT: retq 2795; 2796; AVX2-LABEL: trunc_packus_v4i64_v4i8_store: 2797; AVX2: # %bb.0: 2798; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] 2799; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 2800; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 2801; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2802; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 2803; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 2804; AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm1 2805; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2806; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2807; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 2808; AVX2-NEXT: vmovd %xmm0, (%rdi) 2809; AVX2-NEXT: vzeroupper 2810; AVX2-NEXT: retq 2811; 2812; AVX512F-LABEL: trunc_packus_v4i64_v4i8_store: 2813; AVX512F: # %bb.0: 2814; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2815; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2816; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2817; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 2818; AVX512F-NEXT: vmovd %xmm0, (%rdi) 2819; AVX512F-NEXT: vzeroupper 2820; AVX512F-NEXT: retq 2821; 2822; AVX512VL-LABEL: trunc_packus_v4i64_v4i8_store: 2823; AVX512VL: # %bb.0: 2824; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2825; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 2826; AVX512VL-NEXT: vpmovusqb %ymm0, (%rdi) 2827; AVX512VL-NEXT: vzeroupper 2828; AVX512VL-NEXT: retq 2829; 2830; AVX512BW-LABEL: trunc_packus_v4i64_v4i8_store: 2831; AVX512BW: # %bb.0: 2832; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2833; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2834; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 2835; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 2836; AVX512BW-NEXT: vmovd %xmm0, (%rdi) 2837; AVX512BW-NEXT: vzeroupper 2838; AVX512BW-NEXT: retq 2839; 2840; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i8_store: 2841; AVX512BWVL: # %bb.0: 2842; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 2843; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 2844; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) 2845; AVX512BWVL-NEXT: vzeroupper 2846; AVX512BWVL-NEXT: retq 2847; 2848; SKX-LABEL: trunc_packus_v4i64_v4i8_store: 2849; SKX: # %bb.0: 2850; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2851; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 2852; SKX-NEXT: vpmovusqb %ymm0, (%rdi) 2853; SKX-NEXT: vzeroupper 2854; SKX-NEXT: retq 2855 %1 = icmp slt <4 x i64> %a0, <i64 255, i64 255, i64 255, i64 255> 2856 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 255, i64 255, i64 255, i64 255> 2857 
%3 = icmp sgt <4 x i64> %2, zeroinitializer 2858 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 2859 %5 = trunc <4 x i64> %4 to <4 x i8> 2860 store <4 x i8> %5, ptr%p1 2861 ret void 2862} 2863 2864define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { 2865; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i8: 2866; SSE2-SSSE3: # %bb.0: 2867; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 2868; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 2869; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 2870; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8 2871; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] 2872; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 2873; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 2874; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 2875; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] 2876; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 2877; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 2878; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] 2879; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 2880; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 2881; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 2882; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 2883; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] 2884; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 2885; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 2886; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 2887; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 2888; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 2889; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 2890; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] 2891; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 2892; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 2893; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 2894; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 2895; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 2896; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] 2897; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 2898; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8 2899; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 2900; SSE2-SSSE3-NEXT: por %xmm8, 
%xmm3 2901; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 2902; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 2903; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] 2904; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 2905; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 2906; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 2907; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 2908; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 2909; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] 2910; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 2911; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 2912; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 2913; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 2914; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 2915; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 2916; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 2917; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 2918; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 2919; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] 2920; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 2921; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 2922; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 2923; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 2924; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 2925; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 2926; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 2927; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 2928; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 2929; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 2930; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 2931; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 2932; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2933; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 2934; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2935; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 2936; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 2937; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 2938; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 2939; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 2940; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 2941; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 2942; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 2943; SSE2-SSSE3-NEXT: pshufd {{.*#+}} 
xmm7 = xmm0[1,1,3,3] 2944; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7 2945; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] 2946; SSE2-SSSE3-NEXT: por %xmm7, %xmm0 2947; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 2948; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm0 2949; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 2950; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 2951; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 2952; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 2953; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 2954; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 2955; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2956; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 2957; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 2958; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 2959; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 2960; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 2961; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 2962; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 2963; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 2964; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] 2965; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 2966; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 2967; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 2968; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] 2969; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 2970; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 2971; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm3 2972; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm0 2973; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 2974; SSE2-SSSE3-NEXT: retq 2975; 2976; SSE41-LABEL: trunc_packus_v8i64_v8i8: 2977; SSE41: # %bb.0: 2978; SSE41-NEXT: movdqa (%rdi), %xmm7 2979; SSE41-NEXT: movdqa 16(%rdi), %xmm5 2980; SSE41-NEXT: movdqa 32(%rdi), %xmm4 2981; SSE41-NEXT: movdqa 48(%rdi), %xmm8 2982; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] 2983; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] 2984; SSE41-NEXT: movdqa %xmm4, %xmm3 2985; SSE41-NEXT: pxor %xmm2, %xmm3 2986; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] 2987; SSE41-NEXT: movdqa %xmm6, %xmm9 2988; SSE41-NEXT: 
pcmpgtd %xmm3, %xmm9 2989; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 2990; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 2991; SSE41-NEXT: pand %xmm3, %xmm0 2992; SSE41-NEXT: por %xmm9, %xmm0 2993; SSE41-NEXT: movapd %xmm1, %xmm3 2994; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 2995; SSE41-NEXT: movdqa %xmm8, %xmm4 2996; SSE41-NEXT: pxor %xmm2, %xmm4 2997; SSE41-NEXT: movdqa %xmm6, %xmm9 2998; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 2999; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 3000; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3001; SSE41-NEXT: pand %xmm4, %xmm0 3002; SSE41-NEXT: por %xmm9, %xmm0 3003; SSE41-NEXT: movapd %xmm1, %xmm4 3004; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 3005; SSE41-NEXT: movdqa %xmm7, %xmm8 3006; SSE41-NEXT: pxor %xmm2, %xmm8 3007; SSE41-NEXT: movdqa %xmm6, %xmm9 3008; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 3009; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 3010; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3011; SSE41-NEXT: pand %xmm8, %xmm0 3012; SSE41-NEXT: por %xmm9, %xmm0 3013; SSE41-NEXT: movapd %xmm1, %xmm8 3014; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 3015; SSE41-NEXT: movdqa %xmm5, %xmm0 3016; SSE41-NEXT: pxor %xmm2, %xmm0 3017; SSE41-NEXT: movdqa %xmm0, %xmm7 3018; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 3019; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 3020; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3021; SSE41-NEXT: pand %xmm7, %xmm0 3022; SSE41-NEXT: por %xmm6, %xmm0 3023; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 3024; SSE41-NEXT: xorpd %xmm5, %xmm5 3025; SSE41-NEXT: movapd %xmm1, %xmm6 3026; SSE41-NEXT: xorpd %xmm2, %xmm6 3027; SSE41-NEXT: movapd %xmm6, %xmm7 3028; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 3029; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 3030; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3031; SSE41-NEXT: pand %xmm7, %xmm0 3032; SSE41-NEXT: por %xmm6, %xmm0 3033; SSE41-NEXT: pxor %xmm6, %xmm6 3034; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 3035; SSE41-NEXT: movapd %xmm8, %xmm1 3036; SSE41-NEXT: xorpd %xmm2, %xmm1 3037; SSE41-NEXT: movapd %xmm1, %xmm7 3038; 
SSE41-NEXT: pcmpeqd %xmm2, %xmm7 3039; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 3040; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 3041; SSE41-NEXT: pand %xmm7, %xmm0 3042; SSE41-NEXT: por %xmm1, %xmm0 3043; SSE41-NEXT: pxor %xmm1, %xmm1 3044; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 3045; SSE41-NEXT: packusdw %xmm6, %xmm1 3046; SSE41-NEXT: movapd %xmm4, %xmm6 3047; SSE41-NEXT: xorpd %xmm2, %xmm6 3048; SSE41-NEXT: movapd %xmm6, %xmm7 3049; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 3050; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 3051; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3052; SSE41-NEXT: pand %xmm7, %xmm0 3053; SSE41-NEXT: por %xmm6, %xmm0 3054; SSE41-NEXT: pxor %xmm6, %xmm6 3055; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 3056; SSE41-NEXT: movapd %xmm3, %xmm4 3057; SSE41-NEXT: xorpd %xmm2, %xmm4 3058; SSE41-NEXT: movapd %xmm4, %xmm7 3059; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 3060; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 3061; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3062; SSE41-NEXT: pand %xmm7, %xmm0 3063; SSE41-NEXT: por %xmm4, %xmm0 3064; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 3065; SSE41-NEXT: packusdw %xmm6, %xmm5 3066; SSE41-NEXT: packusdw %xmm5, %xmm1 3067; SSE41-NEXT: packuswb %xmm1, %xmm1 3068; SSE41-NEXT: movdqa %xmm1, %xmm0 3069; SSE41-NEXT: retq 3070; 3071; AVX1-LABEL: trunc_packus_v8i64_v8i8: 3072; AVX1: # %bb.0: 3073; AVX1-NEXT: vmovdqa (%rdi), %xmm0 3074; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 3075; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 3076; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 3077; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] 3078; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 3079; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 3080; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 3081; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 3082; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 3083; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 3084; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 3085; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 3086; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 3087; 
AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 3088; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 3089; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 3090; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 3091; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3092; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 3093; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 3094; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 3095; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 3096; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 3097; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3098; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 3099; AVX1-NEXT: retq 3100; 3101; AVX2-LABEL: trunc_packus_v8i64_v8i8: 3102; AVX2: # %bb.0: 3103; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3104; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 3105; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] 3106; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 3107; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 3108; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 3109; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 3110; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3111; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 3112; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 3113; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 3114; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 3115; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3116; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3117; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3118; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3119; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 3120; AVX2-NEXT: vzeroupper 3121; AVX2-NEXT: retq 3122; 3123; AVX512-LABEL: trunc_packus_v8i64_v8i8: 3124; AVX512: # %bb.0: 3125; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 3126; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 3127; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 3128; AVX512-NEXT: vzeroupper 3129; AVX512-NEXT: retq 3130; 3131; SKX-LABEL: trunc_packus_v8i64_v8i8: 3132; SKX: # %bb.0: 3133; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3134; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 3135; SKX-NEXT: vpmovusqb %ymm1, %xmm1 3136; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 3137; 
SKX-NEXT: vpmovusqb %ymm0, %xmm0 3138; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3139; SKX-NEXT: vzeroupper 3140; SKX-NEXT: retq 3141 %a0 = load <8 x i64>, ptr %p0 3142 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 3143 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 3144 %3 = icmp sgt <8 x i64> %2, zeroinitializer 3145 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 3146 %5 = trunc <8 x i64> %4 to <8 x i8> 3147 ret <8 x i8> %5 3148} 3149 3150define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-width"="256" { 3151; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i8_store: 3152; SSE2-SSSE3: # %bb.0: 3153; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 3154; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm3 3155; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 3156; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8 3157; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] 3158; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] 3159; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 3160; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 3161; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] 3162; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 3163; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 3164; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] 3165; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 3166; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 3167; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 3168; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 3169; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] 3170; SSE2-SSSE3-NEXT: por %xmm11, %xmm1 3171; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 3172; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 3173; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 3174; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm2 3175; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 3176; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] 3177; SSE2-SSSE3-NEXT: pcmpeqd 
%xmm7, %xmm9 3178; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 3179; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 3180; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 3181; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 3182; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] 3183; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 3184; SSE2-SSSE3-NEXT: pand %xmm2, %xmm8 3185; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 3186; SSE2-SSSE3-NEXT: por %xmm8, %xmm2 3187; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 3188; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8 3189; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] 3190; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 3191; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 3192; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 3193; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] 3194; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 3195; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] 3196; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 3197; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 3198; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 3199; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 3200; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 3201; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 3202; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 3203; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 3204; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 3205; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] 3206; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 3207; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 3208; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 3209; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 3210; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 3211; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 3212; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm3 3213; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 3214; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 3215; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 3216; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 3217; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 3218; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3219; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 3220; SSE2-SSSE3-NEXT: pshufd 
{{.*#+}} xmm4 = xmm4[1,1,3,3] 3221; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 3222; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 3223; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 3224; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 3225; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 3226; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 3227; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 3228; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 3229; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] 3230; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7 3231; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 3232; SSE2-SSSE3-NEXT: por %xmm7, %xmm3 3233; SSE2-SSSE3-NEXT: pand %xmm8, %xmm3 3234; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm3 3235; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 3236; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4 3237; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 3238; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 3239; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 3240; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 3241; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 3242; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 3243; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 3244; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 3245; SSE2-SSSE3-NEXT: pand %xmm2, %xmm5 3246; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 3247; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 3248; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 3249; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 3250; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] 3251; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 3252; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 3253; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 3254; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 3255; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 3256; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 3257; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm2 3258; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm3 3259; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm3 3260; SSE2-SSSE3-NEXT: movq %xmm3, (%rsi) 3261; SSE2-SSSE3-NEXT: retq 3262; 3263; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: 3264; SSE41: # %bb.0: 3265; 
SSE41-NEXT: movdqa (%rdi), %xmm7 3266; SSE41-NEXT: movdqa 16(%rdi), %xmm5 3267; SSE41-NEXT: movdqa 32(%rdi), %xmm3 3268; SSE41-NEXT: movdqa 48(%rdi), %xmm8 3269; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] 3270; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] 3271; SSE41-NEXT: movdqa %xmm3, %xmm2 3272; SSE41-NEXT: pxor %xmm1, %xmm2 3273; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] 3274; SSE41-NEXT: movdqa %xmm6, %xmm9 3275; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 3276; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 3277; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3278; SSE41-NEXT: pand %xmm2, %xmm0 3279; SSE41-NEXT: por %xmm9, %xmm0 3280; SSE41-NEXT: movapd %xmm4, %xmm2 3281; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 3282; SSE41-NEXT: movdqa %xmm8, %xmm3 3283; SSE41-NEXT: pxor %xmm1, %xmm3 3284; SSE41-NEXT: movdqa %xmm6, %xmm9 3285; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 3286; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 3287; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3288; SSE41-NEXT: pand %xmm3, %xmm0 3289; SSE41-NEXT: por %xmm9, %xmm0 3290; SSE41-NEXT: movapd %xmm4, %xmm3 3291; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 3292; SSE41-NEXT: movdqa %xmm7, %xmm8 3293; SSE41-NEXT: pxor %xmm1, %xmm8 3294; SSE41-NEXT: movdqa %xmm6, %xmm9 3295; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 3296; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 3297; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3298; SSE41-NEXT: pand %xmm8, %xmm0 3299; SSE41-NEXT: por %xmm9, %xmm0 3300; SSE41-NEXT: movapd %xmm4, %xmm8 3301; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 3302; SSE41-NEXT: movdqa %xmm5, %xmm0 3303; SSE41-NEXT: pxor %xmm1, %xmm0 3304; SSE41-NEXT: movdqa %xmm0, %xmm7 3305; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 3306; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 3307; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3308; SSE41-NEXT: pand %xmm7, %xmm0 3309; SSE41-NEXT: por %xmm6, %xmm0 3310; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 3311; SSE41-NEXT: xorpd %xmm5, %xmm5 3312; SSE41-NEXT: movapd %xmm4, %xmm6 3313; SSE41-NEXT: 
xorpd %xmm1, %xmm6 3314; SSE41-NEXT: movapd %xmm6, %xmm7 3315; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 3316; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 3317; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3318; SSE41-NEXT: pand %xmm7, %xmm0 3319; SSE41-NEXT: por %xmm6, %xmm0 3320; SSE41-NEXT: pxor %xmm6, %xmm6 3321; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 3322; SSE41-NEXT: movapd %xmm8, %xmm4 3323; SSE41-NEXT: xorpd %xmm1, %xmm4 3324; SSE41-NEXT: movapd %xmm4, %xmm7 3325; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 3326; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 3327; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3328; SSE41-NEXT: pand %xmm7, %xmm0 3329; SSE41-NEXT: por %xmm4, %xmm0 3330; SSE41-NEXT: pxor %xmm4, %xmm4 3331; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 3332; SSE41-NEXT: packusdw %xmm6, %xmm4 3333; SSE41-NEXT: movapd %xmm3, %xmm6 3334; SSE41-NEXT: xorpd %xmm1, %xmm6 3335; SSE41-NEXT: movapd %xmm6, %xmm7 3336; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 3337; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 3338; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3339; SSE41-NEXT: pand %xmm7, %xmm0 3340; SSE41-NEXT: por %xmm6, %xmm0 3341; SSE41-NEXT: pxor %xmm6, %xmm6 3342; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 3343; SSE41-NEXT: movapd %xmm2, %xmm3 3344; SSE41-NEXT: xorpd %xmm1, %xmm3 3345; SSE41-NEXT: movapd %xmm3, %xmm7 3346; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 3347; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 3348; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 3349; SSE41-NEXT: pand %xmm7, %xmm0 3350; SSE41-NEXT: por %xmm3, %xmm0 3351; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 3352; SSE41-NEXT: packusdw %xmm6, %xmm5 3353; SSE41-NEXT: packusdw %xmm5, %xmm4 3354; SSE41-NEXT: packuswb %xmm4, %xmm4 3355; SSE41-NEXT: movq %xmm4, (%rsi) 3356; SSE41-NEXT: retq 3357; 3358; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: 3359; AVX1: # %bb.0: 3360; AVX1-NEXT: vmovdqa (%rdi), %xmm0 3361; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 3362; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 3363; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 3364; AVX1-NEXT: vpmovzxbq 
{{.*#+}} xmm4 = [255,255] 3365; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 3366; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 3367; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 3368; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 3369; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 3370; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 3371; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 3372; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 3373; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 3374; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 3375; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 3376; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 3377; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 3378; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3379; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 3380; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 3381; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 3382; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 3383; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 3384; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3385; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 3386; AVX1-NEXT: vmovq %xmm0, (%rsi) 3387; AVX1-NEXT: retq 3388; 3389; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: 3390; AVX2: # %bb.0: 3391; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3392; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 3393; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] 3394; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 3395; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 3396; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 3397; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 3398; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3399; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 3400; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 3401; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 3402; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 3403; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3404; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3405; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3406; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3407; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 3408; AVX2-NEXT: vmovq %xmm0, (%rsi) 3409; 
AVX2-NEXT: vzeroupper 3410; AVX2-NEXT: retq 3411; 3412; AVX512-LABEL: trunc_packus_v8i64_v8i8_store: 3413; AVX512: # %bb.0: 3414; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 3415; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 3416; AVX512-NEXT: vpmovusqb %zmm0, (%rsi) 3417; AVX512-NEXT: vzeroupper 3418; AVX512-NEXT: retq 3419; 3420; SKX-LABEL: trunc_packus_v8i64_v8i8_store: 3421; SKX: # %bb.0: 3422; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3423; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 3424; SKX-NEXT: vpmovusqb %ymm1, %xmm1 3425; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 3426; SKX-NEXT: vpmovusqb %ymm0, %xmm0 3427; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3428; SKX-NEXT: vmovq %xmm0, (%rsi) 3429; SKX-NEXT: vzeroupper 3430; SKX-NEXT: retq 3431 %a0 = load <8 x i64>, ptr %p0 3432 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 3433 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 3434 %3 = icmp sgt <8 x i64> %2, zeroinitializer 3435 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 3436 %5 = trunc <8 x i64> %4 to <8 x i8> 3437 store <8 x i8> %5, ptr%p1 3438 ret void 3439} 3440 3441define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { 3442; SSE2-SSSE3-LABEL: trunc_packus_v16i64_v16i8: 3443; SSE2-SSSE3: # %bb.0: 3444; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm7 3445; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 3446; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm12 3447; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm11 3448; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm10 3449; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm5 3450; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm4 3451; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm3 3452; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [255,255] 3453; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 3454; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 3455; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 3456; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] 3457; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm9 3458; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3459; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] 3460; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3461; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 3462; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3463; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3464; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] 3465; SSE2-SSSE3-NEXT: por %xmm15, %xmm2 3466; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 3467; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm2 3468; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 3469; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3 3470; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 3471; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] 3472; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3473; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3474; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 3475; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3476; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3477; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] 3478; SSE2-SSSE3-NEXT: por %xmm15, %xmm3 3479; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 3480; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm3 3481; SSE2-SSSE3-NEXT: por %xmm4, %xmm3 3482; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4 3483; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 3484; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] 3485; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3486; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3487; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 3488; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3489; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3490; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] 3491; SSE2-SSSE3-NEXT: por %xmm15, %xmm4 3492; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 3493; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm4 3494; SSE2-SSSE3-NEXT: por %xmm5, %xmm4 3495; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm5 3496; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 3497; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] 3498; 
SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3499; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3500; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 3501; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3502; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3503; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] 3504; SSE2-SSSE3-NEXT: por %xmm15, %xmm5 3505; SSE2-SSSE3-NEXT: pand %xmm5, %xmm10 3506; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5 3507; SSE2-SSSE3-NEXT: por %xmm10, %xmm5 3508; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm10 3509; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm10 3510; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] 3511; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3512; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3513; SSE2-SSSE3-NEXT: pcmpgtd %xmm10, %xmm14 3514; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3515; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3516; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3] 3517; SSE2-SSSE3-NEXT: por %xmm15, %xmm10 3518; SSE2-SSSE3-NEXT: pand %xmm10, %xmm12 3519; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 3520; SSE2-SSSE3-NEXT: por %xmm12, %xmm10 3521; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm12 3522; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm12 3523; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] 3524; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3525; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3526; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 3527; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3528; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3529; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] 3530; SSE2-SSSE3-NEXT: por %xmm15, %xmm12 3531; SSE2-SSSE3-NEXT: pand %xmm12, %xmm11 3532; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm12 3533; SSE2-SSSE3-NEXT: por %xmm11, %xmm12 3534; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm11 3535; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm11 3536; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] 3537; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3538; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 3539; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 3540; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] 3541; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 3542; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] 3543; SSE2-SSSE3-NEXT: por %xmm15, %xmm11 3544; SSE2-SSSE3-NEXT: pand %xmm11, %xmm7 3545; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm11 3546; SSE2-SSSE3-NEXT: por %xmm7, %xmm11 3547; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 3548; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 3549; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] 3550; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 3551; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 3552; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] 3553; SSE2-SSSE3-NEXT: pand %xmm13, %xmm7 3554; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] 3555; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 3556; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 3557; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm8 3558; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 3559; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 3560; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 3561; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 3562; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 3563; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] 3564; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 3565; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3566; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 3567; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 3568; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 3569; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 3570; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm0 3571; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 3572; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 3573; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 3574; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] 3575; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 3576; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] 3577; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 3578; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] 3579; SSE2-SSSE3-NEXT: por %xmm9, %xmm0 3580; SSE2-SSSE3-NEXT: pand %xmm11, %xmm0 3581; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm0 3582; SSE2-SSSE3-NEXT: movdqa 
%xmm12, %xmm6 3583; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 3584; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 3585; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 3586; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] 3587; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 3588; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 3589; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 3590; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 3591; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 3592; SSE2-SSSE3-NEXT: pand %xmm12, %xmm7 3593; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm6 3594; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 3595; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 3596; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 3597; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] 3598; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 3599; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 3600; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 3601; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] 3602; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 3603; SSE2-SSSE3-NEXT: pand %xmm10, %xmm8 3604; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm8 3605; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm0 3606; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 3607; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 3608; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 3609; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 3610; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] 3611; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 3612; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 3613; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 3614; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 3615; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 3616; SSE2-SSSE3-NEXT: pand %xmm5, %xmm7 3617; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 3618; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 3619; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 3620; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 3621; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] 3622; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 3623; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 3624; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 3625; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 3626; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 3627; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 3628; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm5 3629; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 3630; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 3631; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 3632; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 3633; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] 3634; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 3635; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 3636; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 3637; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 3638; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 3639; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 3640; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 3641; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 3642; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 3643; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 3644; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] 3645; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 3646; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 3647; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 3648; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] 3649; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 3650; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 3651; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm3 3652; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm5 3653; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm0 3654; SSE2-SSSE3-NEXT: retq 3655; 3656; SSE41-LABEL: trunc_packus_v16i64_v16i8: 3657; SSE41: # %bb.0: 3658; SSE41-NEXT: movdqa (%rdi), %xmm8 3659; SSE41-NEXT: movdqa 16(%rdi), %xmm7 3660; SSE41-NEXT: movdqa 32(%rdi), %xmm12 3661; SSE41-NEXT: movdqa 48(%rdi), %xmm11 3662; SSE41-NEXT: movdqa 80(%rdi), %xmm10 3663; SSE41-NEXT: movdqa 64(%rdi), %xmm6 3664; SSE41-NEXT: movdqa 112(%rdi), %xmm5 3665; SSE41-NEXT: movdqa 96(%rdi), %xmm4 3666; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] 3667; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] 3668; SSE41-NEXT: movdqa %xmm4, %xmm3 3669; SSE41-NEXT: pxor %xmm2, %xmm3 3670; SSE41-NEXT: pmovzxdq 
{{.*#+}} xmm9 = [2147483903,2147483903] 3671; SSE41-NEXT: movdqa %xmm9, %xmm13 3672; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 3673; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 3674; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3675; SSE41-NEXT: pand %xmm3, %xmm0 3676; SSE41-NEXT: por %xmm13, %xmm0 3677; SSE41-NEXT: movapd %xmm1, %xmm3 3678; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 3679; SSE41-NEXT: movdqa %xmm5, %xmm4 3680; SSE41-NEXT: pxor %xmm2, %xmm4 3681; SSE41-NEXT: movdqa %xmm9, %xmm13 3682; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 3683; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 3684; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3685; SSE41-NEXT: pand %xmm4, %xmm0 3686; SSE41-NEXT: por %xmm13, %xmm0 3687; SSE41-NEXT: movapd %xmm1, %xmm4 3688; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 3689; SSE41-NEXT: movdqa %xmm6, %xmm5 3690; SSE41-NEXT: pxor %xmm2, %xmm5 3691; SSE41-NEXT: movdqa %xmm9, %xmm13 3692; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 3693; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 3694; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3695; SSE41-NEXT: pand %xmm5, %xmm0 3696; SSE41-NEXT: por %xmm13, %xmm0 3697; SSE41-NEXT: movapd %xmm1, %xmm5 3698; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 3699; SSE41-NEXT: movdqa %xmm10, %xmm6 3700; SSE41-NEXT: pxor %xmm2, %xmm6 3701; SSE41-NEXT: movdqa %xmm9, %xmm13 3702; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 3703; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 3704; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3705; SSE41-NEXT: pand %xmm6, %xmm0 3706; SSE41-NEXT: por %xmm13, %xmm0 3707; SSE41-NEXT: movapd %xmm1, %xmm6 3708; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 3709; SSE41-NEXT: movdqa %xmm12, %xmm10 3710; SSE41-NEXT: pxor %xmm2, %xmm10 3711; SSE41-NEXT: movdqa %xmm9, %xmm13 3712; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 3713; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 3714; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3715; SSE41-NEXT: pand %xmm10, %xmm0 3716; SSE41-NEXT: por %xmm13, %xmm0 3717; SSE41-NEXT: movapd %xmm1, %xmm10 3718; SSE41-NEXT: blendvpd %xmm0, %xmm12, 
%xmm10 3719; SSE41-NEXT: movdqa %xmm11, %xmm12 3720; SSE41-NEXT: pxor %xmm2, %xmm12 3721; SSE41-NEXT: movdqa %xmm9, %xmm13 3722; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 3723; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 3724; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3725; SSE41-NEXT: pand %xmm12, %xmm0 3726; SSE41-NEXT: por %xmm13, %xmm0 3727; SSE41-NEXT: movapd %xmm1, %xmm12 3728; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 3729; SSE41-NEXT: movdqa %xmm8, %xmm11 3730; SSE41-NEXT: pxor %xmm2, %xmm11 3731; SSE41-NEXT: movdqa %xmm9, %xmm13 3732; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 3733; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 3734; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] 3735; SSE41-NEXT: pand %xmm11, %xmm0 3736; SSE41-NEXT: por %xmm13, %xmm0 3737; SSE41-NEXT: movapd %xmm1, %xmm11 3738; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 3739; SSE41-NEXT: movdqa %xmm7, %xmm0 3740; SSE41-NEXT: pxor %xmm2, %xmm0 3741; SSE41-NEXT: movdqa %xmm0, %xmm8 3742; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 3743; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 3744; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3745; SSE41-NEXT: pand %xmm8, %xmm0 3746; SSE41-NEXT: por %xmm9, %xmm0 3747; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 3748; SSE41-NEXT: xorpd %xmm7, %xmm7 3749; SSE41-NEXT: movapd %xmm1, %xmm8 3750; SSE41-NEXT: xorpd %xmm2, %xmm8 3751; SSE41-NEXT: movapd %xmm8, %xmm9 3752; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 3753; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 3754; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] 3755; SSE41-NEXT: pand %xmm9, %xmm0 3756; SSE41-NEXT: por %xmm8, %xmm0 3757; SSE41-NEXT: pxor %xmm8, %xmm8 3758; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 3759; SSE41-NEXT: movapd %xmm11, %xmm1 3760; SSE41-NEXT: xorpd %xmm2, %xmm1 3761; SSE41-NEXT: movapd %xmm1, %xmm9 3762; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 3763; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 3764; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 3765; SSE41-NEXT: pand %xmm9, %xmm0 3766; SSE41-NEXT: por %xmm1, %xmm0 3767; SSE41-NEXT: pxor %xmm1, %xmm1 3768; 
SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 3769; SSE41-NEXT: packusdw %xmm8, %xmm1 3770; SSE41-NEXT: movapd %xmm12, %xmm8 3771; SSE41-NEXT: xorpd %xmm2, %xmm8 3772; SSE41-NEXT: movapd %xmm8, %xmm9 3773; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 3774; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 3775; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] 3776; SSE41-NEXT: pand %xmm9, %xmm0 3777; SSE41-NEXT: por %xmm8, %xmm0 3778; SSE41-NEXT: pxor %xmm8, %xmm8 3779; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 3780; SSE41-NEXT: movapd %xmm10, %xmm9 3781; SSE41-NEXT: xorpd %xmm2, %xmm9 3782; SSE41-NEXT: movapd %xmm9, %xmm11 3783; SSE41-NEXT: pcmpeqd %xmm2, %xmm11 3784; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 3785; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 3786; SSE41-NEXT: pand %xmm11, %xmm0 3787; SSE41-NEXT: por %xmm9, %xmm0 3788; SSE41-NEXT: pxor %xmm9, %xmm9 3789; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm9 3790; SSE41-NEXT: packusdw %xmm8, %xmm9 3791; SSE41-NEXT: packusdw %xmm9, %xmm1 3792; SSE41-NEXT: movapd %xmm6, %xmm8 3793; SSE41-NEXT: xorpd %xmm2, %xmm8 3794; SSE41-NEXT: movapd %xmm8, %xmm9 3795; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 3796; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 3797; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] 3798; SSE41-NEXT: pand %xmm9, %xmm0 3799; SSE41-NEXT: por %xmm8, %xmm0 3800; SSE41-NEXT: pxor %xmm8, %xmm8 3801; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 3802; SSE41-NEXT: movapd %xmm5, %xmm6 3803; SSE41-NEXT: xorpd %xmm2, %xmm6 3804; SSE41-NEXT: movapd %xmm6, %xmm9 3805; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 3806; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 3807; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3808; SSE41-NEXT: pand %xmm9, %xmm0 3809; SSE41-NEXT: por %xmm6, %xmm0 3810; SSE41-NEXT: pxor %xmm6, %xmm6 3811; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 3812; SSE41-NEXT: packusdw %xmm8, %xmm6 3813; SSE41-NEXT: movapd %xmm4, %xmm5 3814; SSE41-NEXT: xorpd %xmm2, %xmm5 3815; SSE41-NEXT: movapd %xmm5, %xmm8 3816; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 3817; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 
3818; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 3819; SSE41-NEXT: pand %xmm8, %xmm0 3820; SSE41-NEXT: por %xmm5, %xmm0 3821; SSE41-NEXT: pxor %xmm5, %xmm5 3822; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 3823; SSE41-NEXT: movapd %xmm3, %xmm4 3824; SSE41-NEXT: xorpd %xmm2, %xmm4 3825; SSE41-NEXT: movapd %xmm4, %xmm8 3826; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 3827; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 3828; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3829; SSE41-NEXT: pand %xmm8, %xmm0 3830; SSE41-NEXT: por %xmm4, %xmm0 3831; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 3832; SSE41-NEXT: packusdw %xmm5, %xmm7 3833; SSE41-NEXT: packusdw %xmm7, %xmm6 3834; SSE41-NEXT: packuswb %xmm6, %xmm1 3835; SSE41-NEXT: movdqa %xmm1, %xmm0 3836; SSE41-NEXT: retq 3837; 3838; AVX1-LABEL: trunc_packus_v16i64_v16i8: 3839; AVX1: # %bb.0: 3840; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 3841; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] 3842; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 3843; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 3844; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 3845; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 3846; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 3847; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 3848; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 3849; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 3850; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 3851; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 3852; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 3853; AVX1-NEXT: vmovdqa (%rdi), %xmm5 3854; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 3855; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 3856; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 3857; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 3858; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7 3859; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9 3860; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8 3861; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9 3862; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 3863; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 3864; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, 
%xmm2 3865; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 3866; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 3867; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2 3868; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 3869; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 3870; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 3871; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5 3872; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 3873; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8 3874; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 3875; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 3876; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3877; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5 3878; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 3879; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5 3880; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 3881; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 3882; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4 3883; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 3884; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4 3885; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 3886; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3887; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 3888; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 3889; AVX1-NEXT: retq 3890; 3891; AVX2-LABEL: trunc_packus_v16i64_v16i8: 3892; AVX2: # %bb.0: 3893; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3894; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 3895; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 3896; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 3897; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3898; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 3899; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 3900; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 3901; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 3902; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 3903; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 3904; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 3905; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 3906; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 3907; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 3908; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 3909; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 3910; 
AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0 3911; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3912; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 3913; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3914; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 3915; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2 3916; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 3917; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] 3918; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3919; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3920; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3921; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3922; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3923; AVX2-NEXT: vzeroupper 3924; AVX2-NEXT: retq 3925; 3926; AVX512-LABEL: trunc_packus_v16i64_v16i8: 3927; AVX512: # %bb.0: 3928; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 3929; AVX512-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1 3930; AVX512-NEXT: vpmovusqb %zmm1, %xmm1 3931; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 3932; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 3933; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3934; AVX512-NEXT: vzeroupper 3935; AVX512-NEXT: retq 3936; 3937; SKX-LABEL: trunc_packus_v16i64_v16i8: 3938; SKX: # %bb.0: 3939; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3940; SKX-NEXT: vpmaxsq 96(%rdi), %ymm0, %ymm1 3941; SKX-NEXT: vpmovusqb %ymm1, %xmm1 3942; SKX-NEXT: vpmaxsq 64(%rdi), %ymm0, %ymm2 3943; SKX-NEXT: vpmovusqb %ymm2, %xmm2 3944; SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3945; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm2 3946; SKX-NEXT: vpmovusqb %ymm2, %xmm2 3947; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 3948; SKX-NEXT: vpmovusqb %ymm0, %xmm0 3949; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 3950; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3951; SKX-NEXT: vzeroupper 3952; SKX-NEXT: retq 3953 %a0 = load <16 x i64>, ptr %p0 3954 %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 
255, i64 255>
  %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
  %3 = icmp sgt <16 x i64> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
  %5 = trunc <16 x i64> %4 to <16 x i8>
  ret <16 x i8> %5
}

; Clamp each i32 lane to [0, 255] (smin with 255, then smax with 0) and
; truncate <4 x i32> to <4 x i8>. All targets match this as a signed-saturate
; pack followed by an unsigned-saturate pack.
define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v4i32_v4i8:
; SSE:       # %bb.0:
; SSE-NEXT:    packssdw %xmm0, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_packus_v4i32_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_packus_v4i32_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; SKX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; SKX-NEXT:    retq
  %1 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
  %3 = icmp sgt <4 x i32> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = trunc <4 x i32> %4 to <4 x i8>
  ret <4 x i8> %5
}

; Same clamp-and-truncate as above but stored to memory; AVX512VL-class
; targets can use the truncating-store form vpmovusdb directly.
define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; SSE-LABEL: trunc_packus_v4i32_v4i8_store:
; SSE:       # %bb.0:
; SSE-NEXT:    packssdw %xmm0, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm0
; SSE-NEXT:    movd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX:       # %bb.0:
; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovusdb %xmm0, (%rdi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rdi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpmovusdb %xmm0, (%rdi)
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i8_store:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; SKX-NEXT:    vpmovusdb %xmm0, (%rdi)
; SKX-NEXT:    retq
  %1 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
  %3 = icmp sgt <4 x i32> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, ptr%p1
  ret void
}

; 256-bit input; pre-AVX512 targets pack the two 128-bit halves, while
; AVX512VL-class targets use a masked max plus vpmovusdb.
define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
; SSE-LABEL: trunc_packus_v8i32_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovusdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovusdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v8i32_v8i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; SKX-NEXT:    vpmovusdb %ymm0, %xmm0
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %3 = icmp sgt <8 x i32> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = trunc <8 x i32> %4 to <8 x i8>
  ret <8 x i8> %5
}

; Store variant of the v8i32 case; result is written as a single 64-bit
; chunk (movq) or via a truncating store on AVX512VL-class targets.
define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) {
; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
; SSE:       # %bb.0:
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm0
; SSE-NEXT:    movq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovusdb %ymm0, (%rdi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovusdb %ymm0, (%rdi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v8i32_v8i8_store:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; SKX-NEXT:    vpmovusdb %ymm0, (%rdi)
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %3 = icmp sgt <8 x i32> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, ptr%p1
  ret void
}

; 512-bit input loaded from memory; the packs fold the loads, and the
; min-legal-vector-width attribute caps vector width on prefer-256 targets.
define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 32(%rdi), %xmm1
; SSE-NEXT:    packssdw 48(%rdi), %xmm1
; SSE-NEXT:    packssdw 16(%rdi), %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm0
; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v16i32_v16i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %ymm0
; SKX-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %a0 = load <16 x i32>, ptr %p0
  %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %3 = icmp sgt <16 x i32> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = trunc <16 x i32> %4 to <16 x i8>
  ret <16 x i8> %5
}

; Store variant of the v16i32 case (note this function names its values
; %a..%f rather than %1..%5, unlike its siblings).
define void @trunc_packus_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v16i32_v16i8_store:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 32(%rdi), %xmm1
; SSE-NEXT:    packssdw 48(%rdi), %xmm1
; SSE-NEXT:    packssdw 16(%rdi), %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm0
; AVX512-NEXT:    vpmovusdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v16i32_v16i8_store:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %ymm0
; SKX-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT:    vpmovuswb %ymm0, (%rsi)
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %a = load <16 x i32>, ptr %p0
  %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %d = icmp sgt <16 x i32> %c, zeroinitializer
  %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
  %f = trunc <16 x i32> %e to <16 x i8>
  store <16 x i8> %f, ptr %p1
  ret void
}

; i16 source lanes already fit the [0, 255] clamp into one unsigned pack.
define <8 x i8> @trunc_packus_v8i16_v8i8(<8 x i16> %a0) {
; SSE-LABEL: trunc_packus_v8i16_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    packuswb %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_packus_v8i16_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_packus_v8i16_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v8i16_v8i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; SKX-NEXT:    retq
  %1 = icmp slt <8 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = icmp sgt <8 x i16> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
  %5 = trunc <8 x i16> %4 to <8 x i8>
  ret <8 x i8> %5
}

; Store variant; BW+VL targets use the vpmovuswb truncating store instead
; of a pack plus 64-bit store.
define void @trunc_packus_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) {
; SSE-LABEL: trunc_packus_v8i16_v8i8_store:
; SSE:       # %bb.0:
; SSE-NEXT:    packuswb %xmm0, %xmm0
; SSE-NEXT:    movq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX:       # %bb.0:
; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovq %xmm0, (%rdi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpmovuswb %xmm0, (%rdi)
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v8i16_v8i8_store:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
; SKX-NEXT:    retq
  %1 = icmp slt <8 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = icmp sgt <8 x i16> %2, zeroinitializer
  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
  %5 = trunc <8 x i16> %4 to <8 x i8>
  store <8 x i8> %5, ptr%p1
  ret void
}

; 256-bit i16 input; one unsigned pack of the two halves suffices except on
; BW+VL targets which use signed-max then vpmovuswb.
define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
; SSE-LABEL: trunc_packus_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovuswb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v16i16_v16i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = icmp sgt <16 x i16> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  %5 = trunc <16 x i16> %4 to <16 x i8>
  ret <16 x i8> %5
}

; 512-bit i16 input loaded from memory; packs fold the loads, and the
; cross-lane vpermq fixes up the interleaved 256-bit pack result.
define <32 x i8> @trunc_packus_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v32i16_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 32(%rdi), %xmm1
; SSE-NEXT:    packuswb 16(%rdi), %xmm0
; SSE-NEXT:    packuswb 48(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v32i16_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vpackuswb 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v32i16_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vpmaxsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpmaxsw (%rdi), %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovuswb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v32i16_v32i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %ymm0
; SKX-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT:    retq
  %a0 = load <32 x i16>, ptr %p0
  %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = icmp sgt <32 x i16> %2, zeroinitializer
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  %5 = trunc <32 x i16> %4 to <32 x i8>
  ret <32 x i8> %5
}

; 1024-bit i32 input loaded from memory, reduced to 32 bytes through two
; levels of packing (or two vpmovusdb halves re-joined on AVX512 targets).
define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v32i32_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 32(%rdi), %xmm2
; SSE-NEXT:    movdqa 64(%rdi), %xmm1
; SSE-NEXT:    movdqa 96(%rdi), %xmm3
; SSE-NEXT:    packssdw 48(%rdi), %xmm2
; SSE-NEXT:    packssdw 16(%rdi), %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packssdw 112(%rdi), %xmm3
; SSE-NEXT:    packssdw 80(%rdi), %xmm1
; SSE-NEXT:    packuswb %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_packus_v32i32_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm2
; AVX1-NEXT:    vmovdqa 96(%rdi), %xmm3
; AVX1-NEXT:    vpackssdw 112(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vpackssdw 80(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_packus_v32i32_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm1
; AVX2-NEXT:    vpackssdw 96(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_packus_v32i32_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm1
; AVX512-NEXT:    vpmovusdb %zmm1, %xmm1
; AVX512-NEXT:    vpmaxsd 64(%rdi), %zmm0, %zmm0
; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v32i32_v32i8:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %ymm0
; SKX-NEXT:    vmovdqa 64(%rdi), %ymm1
; SKX-NEXT:    vpackssdw 96(%rdi), %ymm1, %ymm1
; SKX-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; SKX-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT:    retq
  %a0 = load <32 x i32>, ptr %p0
  %1 = icmp slt <32 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = select <32 x i1> %1, <32 x i32> %a0, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %3 = icmp sgt <32 x i32> %2, zeroinitializer
  %4 = select <32 x i1> %3, <32 x i32> %2, <32 x i32> zeroinitializer
  %5 = trunc <32 x i32> %4 to <32 x i8>
  ret <32 x i8> %5
}