; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=XOP,XOP-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512VL-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512VLBW

; These test cases are inspired by C++2a std::midpoint().
; See https://bugs.llvm.org/show_bug.cgi?id=40965
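;
; In scalar terms, each function below computes the overflow-safe midpoint
; (a sketch of the IR pattern, not part of the autogenerated checks):
;   sign  = a1 > a2 ? -1 : 1               (%t3, %t4)
;   delta = max(a1, a2) - min(a1, a2)      (%t5..%t7)
;   mid   = a1 + sign * (delta >> 1)       (%t8..%a10)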

; Using 128-bit vector regs.

; ---------------------------------------------------------------------------- ;
; 32-bit width. 128 / 32 = 4 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs

define <4 x i32> @vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwind {
; SSE2-LABEL: vec128_i32_signed_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i32_signed_reg_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pminsd %xmm1, %xmm3
; SSE41-NEXT: pmaxsd %xmm0, %xmm1
; SSE41-NEXT: psubd %xmm3, %xmm1
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm2
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i32_signed_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_signed_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i32_signed_reg_reg:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i32_signed_reg_reg:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i32_signed_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i32_signed_reg_reg:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}
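
; Note on the lowerings checked above (an observation on the generated code,
; not an autogenerated check): SSE2 lacks pmulld, so it forms the absolute
; difference with the pxor/psubd conditional-negate idiom and splits the ±1
; multiply into pmuludq halves; SSE4.1/AVX/AVX2 use pminsd/pmaxsd plus pmulld;
; XOP folds the multiply and add into a single vpmacsdd; and AVX512VL avoids
; the multiply entirely by negating under the %k1 mask.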

define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwind {
; SSE2-LABEL: vec128_i32_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1]
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i32_unsigned_reg_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT: pmaxud %xmm0, %xmm1
; SSE41-NEXT: psubd %xmm2, %xmm1
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm4
; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i32_unsigned_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_unsigned_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1,1,1,1]
; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i32_unsigned_reg_reg:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i32_unsigned_reg_reg:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpminud %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i32_unsigned_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i32_unsigned_reg_reg:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vpminud %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %t3 = icmp ugt <4 x i32> %a1, %a2
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul <4 x i32> %t8, %t4
  %a10 = add <4 x i32> %t9, %a1
  ret <4 x i32> %a10
}

; Values are loaded. Only check signed case.
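; (As the two functions above show, the unsigned variant differs from the
; signed one only in the compare and the min/max flavor, so the mem/reg
; permutations below only exercise the signed pattern.)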

define <4 x i32> @vec128_i32_signed_mem_reg(ptr %a1_addr, <4 x i32> %a2) nounwind {
; SSE2-LABEL: vec128_i32_signed_mem_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i32_signed_mem_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pminsd %xmm0, %xmm3
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm3, %xmm0
; SSE41-NEXT: psrld $1, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i32_signed_mem_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpminsd %xmm0, %xmm1, %xmm3
; AVX2-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i32_signed_mem_reg:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3
; XOPAVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; XOPAVX1-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i32_signed_mem_reg:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsd %xmm0, %xmm1, %xmm3
; XOPAVX2-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsrld $1, %xmm0, %xmm0
; XOPAVX2-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i32_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsd %xmm0, %xmm1, %xmm2
; AVX512F-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX512F-NEXT: vpmulld %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i32_signed_mem_reg:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; AVX512VL-NEXT: vpminsd %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubd %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_mem_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %a1 = load <4 x i32>, ptr %a1_addr
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i32_signed_reg_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i32_signed_reg_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pminsd %xmm1, %xmm3
; SSE41-NEXT: pmaxsd %xmm0, %xmm1
; SSE41-NEXT: psubd %xmm3, %xmm1
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i32_signed_reg_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i32_signed_reg_mem:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i32_signed_reg_mem:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i32_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i32_signed_reg_mem:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %a2 = load <4 x i32>, ptr %a2_addr
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i32_signed_mem_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i32_signed_mem_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa (%rsi), %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pminsd %xmm0, %xmm3
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm3, %xmm0
; SSE41-NEXT: psrld $1, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i32_signed_mem_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_signed_mem_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; XOP-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i32_signed_mem_mem:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0
; XOPAVX1-NEXT: vmovdqa (%rsi), %xmm1
; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i32_signed_mem_mem:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0
; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1
; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i32_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i32_signed_mem_mem:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1
; AVX512VL-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubd %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i32_signed_mem_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %a1 = load <4 x i32>, ptr %a1_addr
  %a2 = load <4 x i32>, ptr %a2_addr
  %t3 = icmp sgt <4 x i32> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %t5 = select <4 x i1> %t3, <4 x i32> %a2, <4 x i32> %a1
  %t6 = select <4 x i1> %t3, <4 x i32> %a1, <4 x i32> %a2
  %t7 = sub <4 x i32> %t6, %t5
  %t8 = lshr <4 x i32> %t7, <i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <4 x i32> %t8, %t4 ; signed
  %a10 = add nsw <4 x i32> %t9, %a1 ; signed
  ret <4 x i32> %a10
}

; ---------------------------------------------------------------------------- ;
; 64-bit width. 128 / 64 = 2 elts.
; ---------------------------------------------------------------------------- ;
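; Note: there is no 128-bit vector i64 multiply until AVX512DQ's vpmullq, so
; in the checks below the ±1 multiply is expanded into 32-bit pmuludq partial
; products (the psrlq $33 / psrlq $32 / psllq $32 sequence).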
828; ---------------------------------------------------------------------------- ; 829 830; Values come from regs 831 832define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind { 833; SSE2-LABEL: vec128_i64_signed_reg_reg: 834; SSE2: # %bb.0: 835; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 836; SSE2-NEXT: movdqa %xmm0, %xmm3 837; SSE2-NEXT: psubq %xmm1, %xmm3 838; SSE2-NEXT: pxor %xmm2, %xmm1 839; SSE2-NEXT: pxor %xmm0, %xmm2 840; SSE2-NEXT: movdqa %xmm2, %xmm4 841; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 842; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 843; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 844; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 845; SSE2-NEXT: pand %xmm5, %xmm1 846; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 847; SSE2-NEXT: por %xmm1, %xmm2 848; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] 849; SSE2-NEXT: por %xmm2, %xmm1 850; SSE2-NEXT: pxor %xmm2, %xmm3 851; SSE2-NEXT: movdqa %xmm2, %xmm4 852; SSE2-NEXT: psubq %xmm3, %xmm4 853; SSE2-NEXT: movdqa %xmm4, %xmm3 854; SSE2-NEXT: psrlq $1, %xmm3 855; SSE2-NEXT: psrlq $33, %xmm4 856; SSE2-NEXT: pmuludq %xmm1, %xmm4 857; SSE2-NEXT: psrlq $32, %xmm2 858; SSE2-NEXT: pmuludq %xmm3, %xmm2 859; SSE2-NEXT: paddq %xmm4, %xmm2 860; SSE2-NEXT: psllq $32, %xmm2 861; SSE2-NEXT: pmuludq %xmm1, %xmm3 862; SSE2-NEXT: paddq %xmm3, %xmm0 863; SSE2-NEXT: paddq %xmm2, %xmm0 864; SSE2-NEXT: retq 865; 866; SSE41-LABEL: vec128_i64_signed_reg_reg: 867; SSE41: # %bb.0: 868; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] 869; SSE41-NEXT: movdqa %xmm0, %xmm3 870; SSE41-NEXT: psubq %xmm1, %xmm3 871; SSE41-NEXT: pxor %xmm2, %xmm1 872; SSE41-NEXT: pxor %xmm0, %xmm2 873; SSE41-NEXT: movdqa %xmm2, %xmm4 874; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 875; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 876; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 877; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 878; SSE41-NEXT: pand %xmm5, %xmm1 879; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 880; SSE41-NEXT: por %xmm1, %xmm2 881; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] 882; SSE41-NEXT: por %xmm2, %xmm1 883; SSE41-NEXT: pxor %xmm2, %xmm3 884; SSE41-NEXT: movdqa %xmm2, %xmm4 885; SSE41-NEXT: psubq %xmm3, %xmm4 886; SSE41-NEXT: movdqa %xmm4, %xmm3 887; SSE41-NEXT: psrlq $1, %xmm3 888; SSE41-NEXT: psrlq $33, %xmm4 889; SSE41-NEXT: pmuludq %xmm1, %xmm4 890; SSE41-NEXT: psrlq $32, %xmm2 891; SSE41-NEXT: pmuludq %xmm3, %xmm2 892; SSE41-NEXT: paddq %xmm4, %xmm2 893; SSE41-NEXT: psllq $32, %xmm2 894; SSE41-NEXT: pmuludq %xmm1, %xmm3 895; SSE41-NEXT: paddq %xmm3, %xmm0 896; SSE41-NEXT: paddq %xmm2, %xmm0 897; SSE41-NEXT: retq 898; 899; AVX-LABEL: vec128_i64_signed_reg_reg: 900; AVX: # %bb.0: 901; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 902; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 903; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 904; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 905; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1 906; AVX-NEXT: vpsrlq $1, %xmm1, %xmm4 907; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1 908; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 909; AVX-NEXT: vpsrlq $32, %xmm2, %xmm2 910; AVX-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 911; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 912; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 913; AVX-NEXT: vpmuludq %xmm3, %xmm4, %xmm2 914; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 915; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 916; AVX-NEXT: retq 917; 918; XOP-LABEL: vec128_i64_signed_reg_reg: 919; XOP: # %bb.0: 920; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 921; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 922; XOP-NEXT: 
vpsubq %xmm1, %xmm0, %xmm1 923; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1 924; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 925; XOP-NEXT: vpsrlq $1, %xmm1, %xmm4 926; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 927; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 928; XOP-NEXT: vpsrlq $32, %xmm2, %xmm2 929; XOP-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 930; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1 931; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 932; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm2 933; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 934; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 935; XOP-NEXT: retq 936; 937; AVX512F-LABEL: vec128_i64_signed_reg_reg: 938; AVX512F: # %bb.0: 939; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 940; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 941; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 942; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 943; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] 944; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 945; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 946; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 947; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1 948; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm2 949; AVX512F-NEXT: vpsrlq $33, %xmm1, %xmm1 950; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 951; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4 952; AVX512F-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 953; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1 954; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 955; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 956; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 957; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 958; AVX512F-NEXT: vzeroupper 959; AVX512F-NEXT: retq 960; 961; AVX512VL-LABEL: vec128_i64_signed_reg_reg: 962; AVX512VL: # %bb.0: 963; AVX512VL-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 964; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm2 965; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm1 966; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1 967; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 968; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 969; AVX512VL-NEXT: vpsubq %xmm1, %xmm2, %xmm1 {%k1} 970; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 971; AVX512VL-NEXT: retq 972; 973; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_reg: 974; AVX512BW-FALLBACK: # %bb.0: 975; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 976; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 977; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 978; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 979; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] 980; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 981; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 982; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 983; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1 984; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2 985; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 986; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 987; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4 988; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 989; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 990; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 991; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 992; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 993; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 994; AVX512BW-FALLBACK-NEXT: vzeroupper 995; AVX512BW-FALLBACK-NEXT: retq 996 %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed 997 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1> 998 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1 999 %t6 = select <2 x i1> 
%t3, <2 x i64> %a1, <2 x i64> %a2 1000 %t7 = sub <2 x i64> %t6, %t5 1001 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1> 1002 %t9 = mul nsw <2 x i64> %t8, %t4 ; signed 1003 %a10 = add nsw <2 x i64> %t9, %a1 ; signed 1004 ret <2 x i64> %a10 1005} 1006 1007define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind { 1008; SSE2-LABEL: vec128_i64_unsigned_reg_reg: 1009; SSE2: # %bb.0: 1010; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] 1011; SSE2-NEXT: movdqa %xmm0, %xmm3 1012; SSE2-NEXT: psubq %xmm1, %xmm3 1013; SSE2-NEXT: pxor %xmm2, %xmm1 1014; SSE2-NEXT: pxor %xmm0, %xmm2 1015; SSE2-NEXT: movdqa %xmm2, %xmm4 1016; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 1017; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 1018; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 1019; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 1020; SSE2-NEXT: pand %xmm5, %xmm1 1021; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1022; SSE2-NEXT: por %xmm1, %xmm2 1023; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] 1024; SSE2-NEXT: por %xmm2, %xmm1 1025; SSE2-NEXT: pxor %xmm2, %xmm3 1026; SSE2-NEXT: movdqa %xmm2, %xmm4 1027; SSE2-NEXT: psubq %xmm3, %xmm4 1028; SSE2-NEXT: movdqa %xmm4, %xmm3 1029; SSE2-NEXT: psrlq $1, %xmm3 1030; SSE2-NEXT: psrlq $33, %xmm4 1031; SSE2-NEXT: pmuludq %xmm1, %xmm4 1032; SSE2-NEXT: psrlq $32, %xmm2 1033; SSE2-NEXT: pmuludq %xmm3, %xmm2 1034; SSE2-NEXT: paddq %xmm4, %xmm2 1035; SSE2-NEXT: psllq $32, %xmm2 1036; SSE2-NEXT: pmuludq %xmm1, %xmm3 1037; SSE2-NEXT: paddq %xmm3, %xmm0 1038; SSE2-NEXT: paddq %xmm2, %xmm0 1039; SSE2-NEXT: retq 1040; 1041; SSE41-LABEL: vec128_i64_unsigned_reg_reg: 1042; SSE41: # %bb.0: 1043; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] 1044; SSE41-NEXT: movdqa %xmm0, %xmm3 1045; SSE41-NEXT: psubq %xmm1, %xmm3 1046; SSE41-NEXT: pxor %xmm2, %xmm1 1047; SSE41-NEXT: pxor %xmm0, %xmm2 1048; SSE41-NEXT: movdqa %xmm2, %xmm4 1049; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 1050; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 1051; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 1052; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 1053; SSE41-NEXT: pand %xmm5, %xmm1 1054; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1055; SSE41-NEXT: por %xmm1, %xmm2 1056; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] 1057; SSE41-NEXT: por %xmm2, %xmm1 1058; SSE41-NEXT: pxor %xmm2, %xmm3 1059; SSE41-NEXT: movdqa %xmm2, %xmm4 1060; SSE41-NEXT: psubq %xmm3, %xmm4 1061; SSE41-NEXT: movdqa %xmm4, %xmm3 1062; SSE41-NEXT: psrlq $1, %xmm3 1063; SSE41-NEXT: psrlq $33, %xmm4 1064; SSE41-NEXT: pmuludq %xmm1, %xmm4 1065; SSE41-NEXT: psrlq $32, %xmm2 1066; SSE41-NEXT: pmuludq %xmm3, %xmm2 1067; SSE41-NEXT: paddq %xmm4, %xmm2 1068; SSE41-NEXT: psllq $32, %xmm2 1069; SSE41-NEXT: pmuludq %xmm1, %xmm3 1070; SSE41-NEXT: paddq %xmm3, %xmm0 1071; SSE41-NEXT: paddq %xmm2, %xmm0 1072; SSE41-NEXT: retq 1073; 1074; AVX1-LABEL: vec128_i64_unsigned_reg_reg: 1075; AVX1: # %bb.0: 1076; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 1077; AVX1-NEXT: # xmm2 = mem[0,0] 1078; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 1079; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 1080; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 1081; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 1082; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 1083; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 1084; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 1085; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 1086; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 1087; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 1088; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 
1089; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 1090; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 1091; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 1092; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm2 1093; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1094; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1095; AVX1-NEXT: retq 1096; 1097; AVX2-LABEL: vec128_i64_unsigned_reg_reg: 1098; AVX2: # %bb.0: 1099; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 1100; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 1101; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 1102; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 1103; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 1104; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 1105; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 1106; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 1107; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm4 1108; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1 1109; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 1110; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 1111; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 1112; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 1113; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 1114; AVX2-NEXT: vpmuludq %xmm3, %xmm4, %xmm2 1115; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1116; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1117; AVX2-NEXT: retq 1118; 1119; XOP-LABEL: vec128_i64_unsigned_reg_reg: 1120; XOP: # %bb.0: 1121; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 1122; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 1123; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 1124; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1 1125; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 1126; XOP-NEXT: vpsrlq $1, %xmm1, %xmm4 1127; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 1128; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 1129; XOP-NEXT: vpsrlq $32, %xmm2, %xmm2 1130; XOP-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 1131; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1 1132; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 1133; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm2 1134; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1135; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1136; XOP-NEXT: retq 1137; 1138; AVX512F-LABEL: vec128_i64_unsigned_reg_reg: 1139; AVX512F: # %bb.0: 1140; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1141; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1142; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 1143; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1144; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] 1145; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 1146; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 1147; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 1148; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1149; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm2 1150; AVX512F-NEXT: vpsrlq $33, %xmm1, %xmm1 1151; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 1152; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4 1153; AVX512F-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 1154; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1 1155; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 1156; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 1157; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1158; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1159; AVX512F-NEXT: vzeroupper 1160; AVX512F-NEXT: retq 1161; 1162; AVX512VL-LABEL: vec128_i64_unsigned_reg_reg: 1163; AVX512VL: # %bb.0: 1164; AVX512VL-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 1165; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm2 1166; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm1 1167; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1168; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 1169; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 1170; AVX512VL-NEXT: vpsubq %xmm1, %xmm2, %xmm1 {%k1} 1171; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1172; 
AVX512VL-NEXT: retq 1173; 1174; AVX512BW-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg: 1175; AVX512BW-FALLBACK: # %bb.0: 1176; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1177; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1178; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 1179; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1180; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] 1181; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} 1182; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 1183; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 1184; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1185; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2 1186; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 1187; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 1188; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4 1189; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 1190; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 1191; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 1192; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 1193; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1194; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1195; AVX512BW-FALLBACK-NEXT: vzeroupper 1196; AVX512BW-FALLBACK-NEXT: retq 1197 %t3 = icmp ugt <2 x i64> %a1, %a2 1198 %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1> 1199 %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1 1200 %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2 1201 %t7 = sub <2 x i64> %t6, %t5 1202 %t8 = lshr <2 x i64> %t7, <i64 1, i64 1> 1203 %t9 = mul <2 x i64> %t8, %t4 1204 %a10 = add <2 x i64> %t9, %a1 1205 ret <2 x i64> %a10 1206} 1207 1208; Values are loaded. Only check signed case. 

define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwind {
; SSE2-LABEL: vec128_i64_signed_mem_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psubq %xmm3, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: psrlq $1, %xmm0
; SSE2-NEXT: psrlq $33, %xmm5
; SSE2-NEXT: pmuludq %xmm4, %xmm5
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm5, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_mem_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psubq %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: por %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: psubq %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm5
; SSE41-NEXT: pmuludq %xmm4, %xmm5
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: paddq %xmm5, %xmm2
; SSE41-NEXT: psllq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_mem_reg:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm4
; AVX-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm3, %xmm4, %xmm2
; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_mem_reg:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm4
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
; XOP-NEXT: vpsrlq $32, %xmm2, %xmm2
; XOP-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlq $1, %xmm0, %xmm2
; AVX512F-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX512F-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512F-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX512F-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i64_signed_mem_reg:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm2
; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubq %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %a1 = load <2 x i64>, ptr %a1_addr
  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
  %t7 = sub <2 x i64> %t6, %t5
  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
  ret <2 x i64> %a10
}
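
; The IR pattern above (repeated, with varying operand sources, by every test
; in this file) is the overflow-safe midpoint: pick a +/-1 multiplier from the
; comparison, subtract min from max, halve, and add the scaled half-difference
; back onto %a1. As a reader aid only (not run or checked by FileCheck), a
; scalar C++ sketch of the same computation; the function name is illustrative:
;
;   #include <cstdint>
;
;   int64_t midpoint_towards_a1(int64_t a1, int64_t a2) {
;     int64_t sign = a1 > a2 ? -1 : 1;                     // %t4
;     int64_t lo = a1 > a2 ? a2 : a1;                      // %t5 (min)
;     int64_t hi = a1 > a2 ? a1 : a2;                      // %t6 (max)
;     uint64_t half = (uint64_t(hi) - uint64_t(lo)) >> 1;  // %t7, %t8 (lshr)
;     return a1 + sign * int64_t(half);                    // %t9, %a10
;   }
;
; Rounding is toward %a1: midpoint_towards_a1(5, 2) == 4, (2, 5) == 3.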

define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i64_signed_reg_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubq %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psubq %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $1, %xmm4
; SSE2-NEXT: psrlq $33, %xmm1
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: psrlq $32, %xmm3
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: paddq %xmm1, %xmm3
; SSE2-NEXT: psllq $32, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm2, %xmm3
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [1,1]
; SSE41-NEXT: por %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubq %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: psubq %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $1, %xmm4
; SSE41-NEXT: psrlq $33, %xmm1
; SSE41-NEXT: pmuludq %xmm2, %xmm1
; SSE41-NEXT: psrlq $32, %xmm3
; SSE41-NEXT: pmuludq %xmm4, %xmm3
; SSE41-NEXT: paddq %xmm1, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_reg_mem:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm4
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm4, %xmm2
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_reg_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm4
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm2, %xmm2
; XOP-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX512F-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512F-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i64_signed_reg_mem:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubq %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %a2 = load <2 x i64>, ptr %a2_addr
  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
  %t7 = sub <2 x i64> %t6, %t5
  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
  ret <2 x i64> %a10
}

define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i64_signed_mem_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psubq %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psubq %xmm3, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: psrlq $1, %xmm0
; SSE2-NEXT: psrlq $33, %xmm5
; SSE2-NEXT: pmuludq %xmm4, %xmm5
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm5, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_mem_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa (%rsi), %xmm0
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psubq %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: por %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: psubq %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm5
; SSE41-NEXT: pmuludq %xmm4, %xmm5
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: paddq %xmm5, %xmm2
; SSE41-NEXT: psllq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_mem_mem:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm4
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm4, %xmm2
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i64_signed_mem_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm0
; XOP-NEXT: vmovdqa (%rsi), %xmm1
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm4
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm2, %xmm2
; XOP-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i64_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX512F-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512F-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vec128_i64_signed_mem_mem:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1
; AVX512VL-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubq %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
  %a1 = load <2 x i64>, ptr %a1_addr
  %a2 = load <2 x i64>, ptr %a2_addr
  %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
  %t4 = select <2 x i1> %t3, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 1, i64 1>
  %t5 = select <2 x i1> %t3, <2 x i64> %a2, <2 x i64> %a1
  %t6 = select <2 x i1> %t3, <2 x i64> %a1, <2 x i64> %a2
  %t7 = sub <2 x i64> %t6, %t5
  %t8 = lshr <2 x i64> %t7, <i64 1, i64 1>
  %t9 = mul nsw <2 x i64> %t8, %t4 ; signed
  %a10 = add nsw <2 x i64> %t9, %a1 ; signed
  ret <2 x i64> %a10
}
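
; Note on the i64 codegen above: without AVX512DQ there is no 128-bit vector
; i64 multiply, so multiplying the halved difference d by the +/-1 vector m is
; assembled from 32-bit pmuludq partial products. A sketch of the identity
; (reader aid only; names are illustrative, arithmetic is modulo 2^64):
;
;   #include <cstdint>
;
;   uint64_t mul64_via_32bit_halves(uint64_t d, uint64_t m) {
;     uint64_t d_lo = d & 0xffffffffu, d_hi = d >> 32;
;     uint64_t m_lo = m & 0xffffffffu, m_hi = m >> 32;
;     // The d_hi*m_hi term would land at bit 64 and is discarded.
;     return ((d_hi * m_lo + d_lo * m_hi) << 32) + d_lo * m_lo;
;   }
;
; In the checks, psrlq $33 is d_hi with the halving shift folded in (33 = 1 + 32),
; psrlq $32 extracts m_hi, and the pmuludq/psllq/paddq sequence assembles the
; partial products.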

; ---------------------------------------------------------------------------- ;
; 16-bit width. 128 / 16 = 8 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs

define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
; SSE-LABEL: vec128_i16_signed_reg_reg:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pcmpgtw %xmm1, %xmm2
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pminsw %xmm1, %xmm3
; SSE-NEXT: pmaxsw %xmm0, %xmm1
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: vec128_i16_signed_reg_reg:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i16_signed_reg_reg:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_signed_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_reg_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
; SSE2-LABEL: vec128_i16_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psubusw %xmm1, %xmm3
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: paddw %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i16_unsigned_reg_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pminuw %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pcmpeqw %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT: pmaxuw %xmm0, %xmm1
; SSE41-NEXT: psubw %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pmullw %xmm1, %xmm4
; SSE41-NEXT: paddw %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i16_unsigned_reg_reg:
; AVX: # %bb.0:
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i16_unsigned_reg_reg:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpminuw %xmm1, %xmm0, %xmm3
; XOP-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2
; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %t3 = icmp ugt <8 x i16> %a1, %a2
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul <8 x i16> %t8, %t4
  %a10 = add <8 x i16> %t9, %a1
  ret <8 x i16> %a10
}
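
; Note on the unsigned test above: x86 has no direct unsigned i16 greater-than
; compare (there is no pcmpgtuw), so two equivalences show up in the checks.
; A scalar sketch of both (reader aid only; names are illustrative):
;
;   #include <cstdint>
;
;   // SSE2: flip the sign bits, then the signed compare gives the unsigned order.
;   bool ugt_via_signbit_flip(uint16_t a, uint16_t b) {
;     return int16_t(a ^ 0x8000) > int16_t(b ^ 0x8000);
;   }
;
;   // SSE4.1/AVX: a >u b  <=>  !(a == min(a, b)); built from pminuw, pcmpeqw
;   // and an XOR with all-ones (the pcmpeqd %xmm4, %xmm4 idiom).
;   bool ugt_via_minu(uint16_t a, uint16_t b) {
;     uint16_t mn = b < a ? b : a;
;     return !(a == mn);
;   }
;
; The SSE2 block also computes max-min as a saturating-subtract pair:
; psubusw(a, b) | psubusw(b, a) is the unsigned absolute difference.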

; Values are loaded. Only check signed case.

define <8 x i16> @vec128_i16_signed_mem_reg(ptr %a1_addr, <8 x i16> %a2) nounwind {
; SSE-LABEL: vec128_i16_signed_mem_reg:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtw %xmm0, %xmm2
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pminsw %xmm0, %xmm3
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm0
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: vec128_i16_signed_mem_reg:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpminsw %xmm0, %xmm1, %xmm3
; AVX-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsubw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i16_signed_mem_reg:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtw %xmm0, %xmm1, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpminsw %xmm0, %xmm1, %xmm3
; XOP-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpsubw %xmm3, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT: vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsw %xmm0, %xmm1, %xmm3
; AVX512F-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsubw %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: vpxor %xmm0, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_mem_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT: vpcmpgtw %xmm0, %xmm1, %k1
; AVX512VLBW-NEXT: vpminsw %xmm0, %xmm1, %xmm2
; AVX512VLBW-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VLBW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
  %a1 = load <8 x i16>, ptr %a1_addr
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, ptr %a2_addr) nounwind {
; SSE-LABEL: vec128_i16_signed_reg_mem:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pcmpgtw %xmm1, %xmm2
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pminsw %xmm1, %xmm3
; SSE-NEXT: pmaxsw %xmm0, %xmm1
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: vec128_i16_signed_reg_mem:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i16_signed_reg_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_reg_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %a2 = load <8 x i16>, ptr %a2_addr
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; SSE-LABEL: vec128_i16_signed_mem_mem:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtw %xmm0, %xmm2
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pminsw %xmm0, %xmm3
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm0
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: vec128_i16_signed_mem_mem:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: vec128_i16_signed_mem_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm0
; XOP-NEXT: vmovdqa (%rsi), %xmm1
; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i16_signed_mem_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VLBW-NEXT: vmovdqa (%rsi), %xmm1
; AVX512VLBW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %a1 = load <8 x i16>, ptr %a1_addr
  %a2 = load <8 x i16>, ptr %a2_addr
  %t3 = icmp sgt <8 x i16> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <8 x i1> %t3, <8 x i16> %a2, <8 x i16> %a1
  %t6 = select <8 x i1> %t3, <8 x i16> %a1, <8 x i16> %a2
  %t7 = sub <8 x i16> %t6, %t5
  %t8 = lshr <8 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <8 x i16> %t8, %t4 ; signed
  %a10 = add nsw <8 x i16> %t9, %a1 ; signed
  ret <8 x i16> %a10
}

; ---------------------------------------------------------------------------- ;
; 8-bit width. 128 / 8 = 16 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs
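
; For the i8 tests below, note that x86 has no byte-granularity vector shift or
; multiply. The lshr-by-1 is therefore a 16-bit psrlw $1 followed by pand with
; 0x7f in every byte (clearing the bit that leaked across each byte boundary),
; and the multiply by +/-1 is widened to 16 bits. A sketch of the shift
; identity on one 16-bit unit (reader aid only; the name is illustrative):
;
;   #include <cstdint>
;
;   uint16_t per_byte_lshr1(uint16_t two_bytes) {
;     // (v >> 1) & 0x7f7f == each byte of v shifted right by 1 independently.
;     return (two_bytes >> 1) & 0x7f7f;
;   }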

define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_signed_reg_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pminsb %xmm1, %xmm3
; SSE41-NEXT: pmaxsb %xmm0, %xmm1
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm1, %xmm5
; SSE41-NEXT: paddb %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_signed_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i8_signed_reg_reg:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i8_signed_reg_reg:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: vzeroupper
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_signed_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
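; NOTE (reader aid only, not checked by FileCheck): the SSE4.1/AVX1 byte
; multiply above is built from two pmaddubsw. Masking every other byte of the
; +/-1 operand to zero makes each 16-bit pmaddubsw lane collapse to a single
; byte product, so one pass yields the even-byte products and a second pass
; (with the complementary mask) the odd ones; psllw $8 + por (or vpperm on
; XOP) recombines them. A C++ sketch of one 16-bit lane (names illustrative):
;
;   #include <cstdint>
;
;   int16_t pmaddubsw_lane(uint8_t u0, int8_t s0, uint8_t u1, int8_t s1) {
;     // Hardware saturates the sum; with s1 == 0 (the masked case) the single
;     // product u0*s0 always fits in 16 bits, so no saturation occurs.
;     return int16_t(u0 * s0 + u1 * s1);
;   }
;
; Similarly, the XOP vpshlb with an all-ones register (vpcmpeqd) shifts each
; byte by -1, i.e. a per-byte logical right shift by 1 without the pand mask.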
AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2512; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 2513; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 2514; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2515; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} 2516; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 2517; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 2518; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 2519; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 2520; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2521; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2522; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 2523; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 2524; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1 2525; AVX512BW-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 2526; AVX512BW-FALLBACK-NEXT: vzeroupper 2527; AVX512BW-FALLBACK-NEXT: retq 2528; 2529; AVX512VLBW-LABEL: vec128_i8_signed_reg_reg: 2530; AVX512VLBW: # %bb.0: 2531; AVX512VLBW-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 2532; AVX512VLBW-NEXT: vpminsb %xmm1, %xmm0, %xmm2 2533; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 2534; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 2535; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 2536; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 2537; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2538; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1} 2539; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 2540; AVX512VLBW-NEXT: retq 2541 %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed 2542 %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2543 %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1 2544 %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2 2545 %t7 = sub <16 x i8> %t6, %t5 2546 %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2547 %t9 = mul nsw <16 x i8> %t8, %t4 ; signed 2548 %a10 = add nsw <16 x i8> %t9, %a1 ; signed 2549 ret <16 x i8> %a10 2550} 2551 2552define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind { 2553; SSE2-LABEL: vec128_i8_unsigned_reg_reg: 2554; SSE2: # %bb.0: 2555; SSE2-NEXT: movdqa %xmm0, %xmm3 2556; SSE2-NEXT: pminub %xmm1, %xmm3 2557; SSE2-NEXT: movdqa %xmm0, %xmm4 2558; SSE2-NEXT: pcmpeqb %xmm3, %xmm4 2559; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 2560; SSE2-NEXT: pxor %xmm4, %xmm2 2561; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2562; SSE2-NEXT: pmaxub %xmm0, %xmm1 2563; SSE2-NEXT: psubb %xmm3, %xmm1 2564; SSE2-NEXT: psrlw $1, %xmm1 2565; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2566; SSE2-NEXT: movdqa %xmm1, %xmm3 2567; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2568; SSE2-NEXT: movdqa %xmm2, %xmm4 2569; SSE2-NEXT: punpckhbw {{.*#+}} 

define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pminub %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpeqb %xmm3, %xmm4
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pmaxub %xmm0, %xmm1
; SSE2-NEXT: psubb %xmm3, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_unsigned_reg_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pminub %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pcmpeqb %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT: pmaxub %xmm0, %xmm1
; SSE41-NEXT: psubb %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm3, %xmm5
; SSE41-NEXT: pand %xmm2, %xmm5
; SSE41-NEXT: pandn %xmm4, %xmm2
; SSE41-NEXT: pmaddubsw %xmm2, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm1, %xmm5
; SSE41-NEXT: paddb %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_unsigned_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_unsigned_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vpcomgtub %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i8_unsigned_reg_reg:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpcomgtub %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminub %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i8_unsigned_reg_reg:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpcomgtub %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpminub %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: vzeroupper
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %t3 = icmp ugt <16 x i8> %a1, %a2
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul <16 x i8> %t8, %t4
  %a10 = add <16 x i8> %t9, %a1
  ret <16 x i8> %a10
}
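
; Note: before AVX512 there is no unsigned byte compare, so in the unsigned
; case `a1 >u a2` is synthesized as NOT(a1 == umin(a1, a2)) with
; pminub + pcmpeqb + pxor against all-ones; XOP has vpcomgtub directly, and
; AVX512BW compares straight into a mask register with vpcmpnleub.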

; Values are loaded. Only check signed case.

define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_signed_mem_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_signed_mem_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pcmpgtb %xmm0, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pminsb %xmm0, %xmm3
; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_mem_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpminsb %xmm0, %xmm1, %xmm3
; AVX2-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_mem_reg:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT: vpcomgtb %xmm0, %xmm1, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i8_signed_mem_reg:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT: vpcomgtb %xmm0, %xmm1, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm3
; XOPAVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i8_signed_mem_reg:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT: vpcomgtb %xmm0, %xmm1, %xmm2
; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsb %xmm0, %xmm1, %xmm3
; XOPAVX2-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vzeroupper
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsb %xmm0, %xmm1, %xmm3
; AVX512F-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i8_signed_mem_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT: vpcmpgtb %xmm0, %xmm1, %k1
; AVX512VLBW-NEXT: vpminsb %xmm0, %xmm1, %xmm2
; AVX512VLBW-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %xmm0, %xmm2, %xmm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
  %a1 = load <16 x i8>, ptr %a1_addr
  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
  ret <16 x i8> %a10
}
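
; Note: mul <16 x i8> has no direct byte multiply on x86, so it is emulated:
; SSE2 unpacks to words and uses two pmullw plus packuswb; SSE4.1/AVX1 and
; XOP split the multiplier into even/odd byte halves with pmaddubsw and
; recombine (por/psllw or vpperm); AVX2/AVX512 zero-extend both operands to
; 16-bit lanes, do one vpmullw, and truncate the product back down.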

define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubb %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_signed_reg_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pminsb %xmm1, %xmm3
; SSE41-NEXT: pmaxsb %xmm0, %xmm1
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm5, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_reg_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_reg_mem:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i8_signed_reg_mem:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i8_signed_reg_mem:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: vzeroupper
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i8_signed_reg_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VLBW-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %a2 = load <16 x i8>, ptr %a2_addr
  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
  ret <16 x i8> %a10
}
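
; Note: the AVX512BW-FALLBACK runs have AVX512BW but not AVX512VL, so a byte
; compare into a mask register is only legal on 512-bit types; the inputs
; are widened to zmm (hence the "# kill" comments), the -1/+1 multiplier is
; materialized with a masked vmovdqu8 over a broadcast of 1, and vpmovwb
; truncates the widened word product back to bytes.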

define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i8_signed_mem_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa (%rsi), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubb %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_signed_mem_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa (%rsi), %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pcmpgtb %xmm0, %xmm2
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pminsb %xmm0, %xmm3
; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm3, %xmm5
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_mem_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_signed_mem_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-FALLBACK-LABEL: vec128_i8_signed_mem_mem:
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; XOP-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
; XOPAVX1-LABEL: vec128_i8_signed_mem_mem:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0
; XOPAVX1-NEXT: vmovdqa (%rsi), %xmm1
; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: vec128_i8_signed_mem_mem:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0
; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1
; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: vzeroupper
; XOPAVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-FALLBACK-NEXT: vzeroupper
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec128_i8_signed_mem_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VLBW-NEXT: vmovdqa (%rsi), %xmm1
; AVX512VLBW-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
; AVX512VLBW-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
  %a1 = load <16 x i8>, ptr %a1_addr
  %a2 = load <16 x i8>, ptr %a2_addr
  %t3 = icmp sgt <16 x i8> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <16 x i1> %t3, <16 x i8> %a2, <16 x i8> %a1
  %t6 = select <16 x i1> %t3, <16 x i8> %a1, <16 x i8> %a2
  %t7 = sub <16 x i8> %t6, %t5
  %t8 = lshr <16 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <16 x i8> %t8, %t4 ; signed
  %a10 = add nsw <16 x i8> %t9, %a1 ; signed
  ret <16 x i8> %a10
}
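
; Note: the AVX512VL-FALLBACK lowerings above avoid the multiply entirely:
; vpternlogd computes (diff >> 1 & 0x7f-mask) ^ sign_mask in one op, and the
; following vpsubb subtracts the sign mask again, conditionally negating the
; halved difference, since (x ^ m) - m == -x when m is all-ones and == x
; when m is zero.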