; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW-FALLBACK
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VLBW

; These test cases are inspired by C++2a std::midpoint().
; See https://bugs.llvm.org/show_bug.cgi?id=40965

; Using 256-bit vector regs.

; ---------------------------------------------------------------------------- ;
; 32-bit width. 256 / 32 = 8 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs
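; All tests below share one skeleton, shown here as a scalar sketch
; (illustrative only, not part of the checked output). The i64/i16 tests
; step from %a1 toward %a2 by half their distance:
;
;   %t3 = icmp sgt i32 %a1, %a2            ; pick a direction
;   %t4 = select i1 %t3, i32 -1, i32 1     ; -1 if %a1 > %a2, else 1
;   %t5 = select i1 %t3, i32 %a2, i32 %a1  ; min(%a1, %a2)
;   %t6 = select i1 %t3, i32 %a1, i32 %a2  ; max(%a1, %a2)
;   %t7 = sub i32 %t6, %t5                 ; absolute difference
;   %t8 = lshr i32 %t7, 1                  ; halved
;   %t9 = mul nsw i32 %t8, %t4             ; re-apply direction
;   %a10 = add nsw i32 %t9, %a1
;
; (The i32 tests in this file multiply %t8 by itself instead of by %t4.)
; The backend recognizes the %t5/%t6 select pairs as min/max, which is why
; the checks mostly show vpmin*/vpmax* rather than compares and blends.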
define <8 x i32> @vec256_i32_signed_reg_reg(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; AVX1-LABEL: vec256_i32_signed_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_signed_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i32_signed_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpminsd %xmm1, %xmm0, %xmm4
; XOP-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $1, %xmm2, %xmm2
; XOP-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: vec256_i32_signed_reg_reg:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %t3 = icmp sgt <8 x i32> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <8 x i1> %t3, <8 x i32> %a2, <8 x i32> %a1
  %t6 = select <8 x i1> %t3, <8 x i32> %a1, <8 x i32> %a2
  %t7 = sub <8 x i32> %t6, %t5
  %t8 = lshr <8 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <8 x i32> %t8, %t8 ; signed
  %a10 = add nsw <8 x i32> %t9, %a1 ; signed
  ret <8 x i32> %a10
}
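; The unsigned variant below differs from the signed one only in the compare
; (icmp ugt), so the backend picks the unsigned min/max forms
; (vpminud/vpmaxud instead of vpminsd/vpmaxsd); the rest of the lowering is
; unchanged.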
define <8 x i32> @vec256_i32_unsigned_reg_reg(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; AVX1-LABEL: vec256_i32_unsigned_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_unsigned_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i32_unsigned_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpminud %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpminud %xmm1, %xmm0, %xmm4
; XOP-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $1, %xmm2, %xmm2
; XOP-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: vec256_i32_unsigned_reg_reg:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %t3 = icmp ugt <8 x i32> %a1, %a2
  %t4 = select <8 x i1> %t3, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <8 x i1> %t3, <8 x i32> %a2, <8 x i32> %a1
  %t6 = select <8 x i1> %t3, <8 x i32> %a1, <8 x i32> %a2
  %t7 = sub <8 x i32> %t6, %t5
  %t8 = lshr <8 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul <8 x i32> %t8, %t8
  %a10 = add <8 x i32> %t9, %a1
  ret <8 x i32> %a10
}

; Values are loaded. Only check signed case.
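; In the mem variants the pointer arguments arrive in %rdi (and %rsi for
; mem_mem) per the x86-64 SysV calling convention. AVX2/AVX512 reload the
; vector as a single ymm; AVX1/XOP, which lack 256-bit integer ops, reload
; it as two xmm halves ((%rdi) and 16(%rdi)).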
define <8 x i32> @vec256_i32_signed_mem_reg(ptr %a1_addr, <8 x i32> %a2) nounwind {
; AVX1-LABEL: vec256_i32_signed_mem_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm4
; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT:    vpmulld %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_signed_mem_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-NEXT:    vpminsd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT:    vpmulld %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i32_signed_mem_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vmovdqa (%rdi), %xmm2
; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
; XOP-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
; XOP-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
; XOP-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
; XOP-NEXT:    vpsrld $1, %xmm0, %xmm0
; XOP-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsdd %xmm2, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: vec256_i32_signed_mem_reg:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512-NEXT:    vpminsd %ymm0, %ymm1, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
; AVX512-NEXT:    vpmulld %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a1 = load <8 x i32>, ptr %a1_addr
  %t3 = icmp sgt <8 x i32> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <8 x i1> %t3, <8 x i32> %a2, <8 x i32> %a1
  %t6 = select <8 x i1> %t3, <8 x i32> %a1, <8 x i32> %a2
  %t7 = sub <8 x i32> %t6, %t5
  %t8 = lshr <8 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <8 x i32> %t8, %t8 ; signed
  %a10 = add nsw <8 x i32> %t9, %a1 ; signed
  ret <8 x i32> %a10
}

define <8 x i32> @vec256_i32_signed_reg_mem(<8 x i32> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i32_signed_reg_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_signed_reg_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i32_signed_reg_mem:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vmovdqa 16(%rdi), %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpminsd %xmm1, %xmm0, %xmm4
; XOP-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $1, %xmm2, %xmm2
; XOP-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: vec256_i32_signed_reg_mem:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a2 = load <8 x i32>, ptr %a2_addr
  %t3 = icmp sgt <8 x i32> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <8 x i1> %t3, <8 x i32> %a2, <8 x i32> %a1
  %t6 = select <8 x i1> %t3, <8 x i32> %a1, <8 x i32> %a2
  %t7 = sub <8 x i32> %t6, %t5
  %t8 = lshr <8 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <8 x i32> %t8, %t8 ; signed
  %a10 = add nsw <8 x i32> %t9, %a1 ; signed
  ret <8 x i32> %a10
}
define <8 x i32> @vec256_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i32_signed_mem_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vmovdqa (%rdi), %xmm2
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_signed_mem_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i32_signed_mem_mem:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %xmm0
; XOP-NEXT:    vmovdqa 16(%rsi), %xmm1
; XOP-NEXT:    vmovdqa (%rdi), %xmm2
; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
; XOP-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
; XOP-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
; XOP-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
; XOP-NEXT:    vpsrld $1, %xmm0, %xmm0
; XOP-NEXT:    vpsrld $1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpmacsdd %xmm2, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: vec256_i32_signed_mem_mem:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpsubd %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpsrld $1, %ymm1, %ymm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a1 = load <8 x i32>, ptr %a1_addr
  %a2 = load <8 x i32>, ptr %a2_addr
  %t3 = icmp sgt <8 x i32> %a1, %a2 ; signed
  %t4 = select <8 x i1> %t3, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t5 = select <8 x i1> %t3, <8 x i32> %a2, <8 x i32> %a1
  %t6 = select <8 x i1> %t3, <8 x i32> %a1, <8 x i32> %a2
  %t7 = sub <8 x i32> %t6, %t5
  %t8 = lshr <8 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t9 = mul nsw <8 x i32> %t8, %t8 ; signed
  %a10 = add nsw <8 x i32> %t9, %a1 ; signed
  ret <8 x i32> %a10
}

; ---------------------------------------------------------------------------- ;
; 64-bit width. 256 / 64 = 4 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs
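; Without AVX512DQ there is no packed 64-bit multiply, so multiplying the
; halved difference (x) by the +/-1 vector (m) is expanded into 32-bit
; partial products:
;
;   x * m == lo(x)*lo(m) + ((lo(x)*hi(m) + hi(x)*lo(m)) << 32)
;
; Hence the vpsrlq $33 (the high half of %t7 >> 1), the vpmuludq partial
; products, and the vpsllq $32 recombination in the checks below. AVX512VL
; avoids the multiply entirely by negating the halved difference under the
; %k1 mask (vpsubq ... {%k1}); the AVX512F/AVX512BW runs without VL widen
; the ymm operands to zmm (the "# kill" notes) to use vpminsq/vpmaxsq and
; the mask compare.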
define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwind {
; AVX1-LABEL: vec256_i64_signed_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm5
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT:    vpmuludq %xmm1, %xmm9, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i64_signed_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i64_signed_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm5, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
; XOP-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT:    vpmuludq %xmm1, %xmm9, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; XOP-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT:    vpsrlq $33, %xmm2, %xmm2
; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec256_i64_signed_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec256_i64_signed_reg_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    retq
  %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %t5 = select <4 x i1> %t3, <4 x i64> %a2, <4 x i64> %a1
  %t6 = select <4 x i1> %t3, <4 x i64> %a1, <4 x i64> %a2
  %t7 = sub <4 x i64> %t6, %t5
  %t8 = lshr <4 x i64> %t7, <i64 1, i64 1, i64 1, i64 1>
  %t9 = mul nsw <4 x i64> %t8, %t4 ; signed
  %a10 = add nsw <4 x i64> %t9, %a1 ; signed
  ret <4 x i64> %a10
}
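; AVX1/AVX2 have no unsigned 64-bit compare, so the unsigned variant flips
; the sign bit of both operands (vpxor with 0x8000000000000000) and reuses
; the signed vpcmpgtq. XOP compares unsigned directly (vpcomgtuq), and
; AVX512 produces the %k1 mask with vpcmpnleuq.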
define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwind {
; AVX1-LABEL: vec256_i64_unsigned_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm4 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm6
; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm6
; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm6
; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm9
; AVX1-NEXT:    vpmuludq %xmm1, %xmm9, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
; AVX1-NEXT:    vpsrlq $33, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i64_unsigned_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i64_unsigned_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpcomgtuq %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpcomgtuq %xmm1, %xmm0, %xmm5
; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm5, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
; XOP-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT:    vpmuludq %xmm1, %xmm9, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; XOP-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT:    vpsrlq $33, %xmm2, %xmm2
; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec256_i64_unsigned_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec256_i64_unsigned_reg_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i64_unsigned_reg_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    retq
  %t3 = icmp ugt <4 x i64> %a1, %a2
  %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %t5 = select <4 x i1> %t3, <4 x i64> %a2, <4 x i64> %a1
  %t6 = select <4 x i1> %t3, <4 x i64> %a1, <4 x i64> %a2
  %t7 = sub <4 x i64> %t6, %t5
  %t8 = lshr <4 x i64> %t7, <i64 1, i64 1, i64 1, i64 1>
  %t9 = mul <4 x i64> %t8, %t4
  %a10 = add <4 x i64> %t9, %a1
  ret <4 x i64> %a10
}

; Values are loaded. Only check signed case.
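; The mem variants load %a1 into %ymm1 while %a2 stays in %ymm0, so the
; compare direction flips relative to reg_reg: vpcmpgtq %ymm0, %ymm1, ...
; computes ymm1 > ymm0 (AT&T operand order), i.e. still %a1 > %a2.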
define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwind {
; AVX1-LABEL: vec256_i64_signed_mem_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa (%rdi), %xmm2
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT:    vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i64_signed_mem_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm4
; AVX2-NEXT:    vpsrlq $33, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i64_signed_mem_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vmovdqa (%rdi), %xmm2
; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
; XOP-NEXT:    vpcomgtq %xmm1, %xmm3, %xmm4
; XOP-NEXT:    vpcomgtq %xmm0, %xmm2, %xmm5
; XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpxor %xmm5, %xmm0, %xmm0
; XOP-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpxor %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT:    vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec256_i64_signed_mem_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512F-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlq $1, %ymm0, %ymm2
; AVX512F-NEXT:    vpsrlq $33, %ymm0, %ymm0
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512F-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec256_i64_signed_mem_reg:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpcmpgtq %ymm0, %ymm1, %k1
; AVX512VL-NEXT:    vpminsq %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT:    vpmaxsq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlq $1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %ymm0, %ymm2, %ymm0 {%k1}
; AVX512VL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_mem_reg:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT:    retq
  %a1 = load <4 x i64>, ptr %a1_addr
  %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %t5 = select <4 x i1> %t3, <4 x i64> %a2, <4 x i64> %a1
  %t6 = select <4 x i1> %t3, <4 x i64> %a1, <4 x i64> %a2
  %t7 = sub <4 x i64> %t6, %t5
  %t8 = lshr <4 x i64> %t7, <i64 1, i64 1, i64 1, i64 1>
  %t9 = mul nsw <4 x i64> %t8, %t4 ; signed
  %a10 = add nsw <4 x i64> %t9, %a1 ; signed
  ret <4 x i64> %a10
}
define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i64_signed_reg_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm5
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT:    vpmuludq %xmm1, %xmm9, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i64_signed_reg_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i64_signed_reg_mem:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vmovdqa 16(%rdi), %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
; XOP-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm5, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
; XOP-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT:    vpmuludq %xmm1, %xmm9, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; XOP-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT:    vpsrlq $33, %xmm2, %xmm2
; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec256_i64_signed_reg_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec256_i64_signed_reg_mem:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_reg_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    retq
  %a2 = load <4 x i64>, ptr %a2_addr
  %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %t5 = select <4 x i1> %t3, <4 x i64> %a2, <4 x i64> %a1
  %t6 = select <4 x i1> %t3, <4 x i64> %a1, <4 x i64> %a2
  %t7 = sub <4 x i64> %t6, %t5
  %t8 = lshr <4 x i64> %t7, <i64 1, i64 1, i64 1, i64 1>
  %t9 = mul nsw <4 x i64> %t8, %t4 ; signed
  %a10 = add nsw <4 x i64> %t9, %a1 ; signed
  ret <4 x i64> %a10
}
define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i64_signed_mem_mem:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vmovdqa (%rdi), %xmm2
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT:    vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i64_signed_mem_mem:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i64_signed_mem_mem:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %xmm0
; XOP-NEXT:    vmovdqa 16(%rsi), %xmm1
; XOP-NEXT:    vmovdqa (%rdi), %xmm2
; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
; XOP-NEXT:    vpcomgtq %xmm1, %xmm3, %xmm4
; XOP-NEXT:    vpcomgtq %xmm0, %xmm2, %xmm5
; XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vpxor %xmm5, %xmm0, %xmm0
; XOP-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpxor %xmm4, %xmm1, %xmm1
; XOP-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT:    vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec256_i64_signed_mem_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vec256_i64_signed_mem_mem:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512VL-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsubq %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_mem_mem:
; AVX512BW-FALLBACK:       # %bb.0:
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT:    retq
  %a1 = load <4 x i64>, ptr %a1_addr
  %a2 = load <4 x i64>, ptr %a2_addr
  %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
  %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %t5 = select <4 x i1> %t3, <4 x i64> %a2, <4 x i64> %a1
  %t6 = select <4 x i1> %t3, <4 x i64> %a1, <4 x i64> %a2
  %t7 = sub <4 x i64> %t6, %t5
  %t8 = lshr <4 x i64> %t7, <i64 1, i64 1, i64 1, i64 1>
  %t9 = mul nsw <4 x i64> %t8, %t4 ; signed
  %a10 = add nsw <4 x i64> %t9, %a1 ; signed
  ret <4 x i64> %a10
}

; ---------------------------------------------------------------------------- ;
; 16-bit width. 256 / 16 = 16 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs
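; i16 has a native packed multiply (vpmullw), so the +/-1 scaling stays a
; real multiply here, and XOP folds the multiply and the final add into a
; single vpmacsww. The AVX512VL-FALLBACK run (VL without BW) has no 16-bit
; compare-into-mask, so it conditionally negates with xor/sub against the
; compare result instead; AVX512VLBW uses the %k1 mask directly.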
define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nounwind {
; AVX1-LABEL: vec256_i16_signed_reg_reg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm5
; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_i16_signed_reg_reg:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: vec256_i16_signed_reg_reg:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpcomgtw %xmm2, %xmm3, %xmm4
; XOP-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm5
; XOP-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
; XOP-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
; XOP-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
; XOP-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $1, %xmm2, %xmm2
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT:    vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT:    vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT:    vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpmacsww %xmm0, %xmm5, %xmm1, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX512F-LABEL: vec256_i16_signed_reg_reg:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_reg_reg:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpxor %ymm1, %ymm2, %ymm2
AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 1328; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 1329; AVX512VL-FALLBACK-NEXT: retq 1330; 1331; AVX512BW-FALLBACK-LABEL: vec256_i16_signed_reg_reg: 1332; AVX512BW-FALLBACK: # %bb.0: 1333; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1334; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1335; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 1336; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1337; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1338; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 1339; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 1340; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 1341; AVX512BW-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 1342; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 1343; AVX512BW-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 1344; AVX512BW-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 1345; AVX512BW-FALLBACK-NEXT: retq 1346; 1347; AVX512VLBW-LABEL: vec256_i16_signed_reg_reg: 1348; AVX512VLBW: # %bb.0: 1349; AVX512VLBW-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 1350; AVX512VLBW-NEXT: vpminsw %ymm1, %ymm0, %ymm2 1351; AVX512VLBW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 1352; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1 1353; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 1354; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 1355; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm2, %ymm1 {%k1} 1356; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm1, %ymm0 1357; AVX512VLBW-NEXT: retq 1358 %t3 = icmp sgt <16 x i16> %a1, %a2 ; signed 1359 %t4 = select <16 x i1> %t3, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1360 %t5 = select <16 x i1> %t3, <16 x i16> %a2, <16 x i16> %a1 1361 %t6 = select <16 x i1> %t3, <16 x i16> %a1, <16 x i16> %a2 1362 %t7 = sub <16 x i16> %t6, %t5 1363 %t16 = lshr <16 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1364 %t9 = mul nsw <16 x i16> %t16, %t4 ; signed 1365 %a10 = add nsw <16 x i16> %t9, %a1 ; signed 1366 ret <16 x i16> %a10 1367} 1368 1369define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nounwind { 1370; AVX1-LABEL: vec256_i16_unsigned_reg_reg: 1371; AVX1: # %bb.0: 1372; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1373; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1374; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm4 1375; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm5 1376; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 1377; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 1378; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm7 1379; AVX1-NEXT: vpcmpeqw %xmm7, %xmm0, %xmm8 1380; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6 1381; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 1382; AVX1-NEXT: vpsubw %xmm7, %xmm1, %xmm1 1383; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 1384; AVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2 1385; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 1386; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 1387; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1] 1388; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 1389; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 1390; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 1391; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 1392; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1393; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 1394; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1395; 
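
; All the i16 tests in this section follow the same overflow-free midpoint
; recipe seen above: smin/smax the operands, subtract to get the absolute
; difference, halve it with a logical shift, multiply by +1 or -1 (the
; compare mask OR'd with 1), and add the result back to %a1. XOP fuses the
; multiply and add into vpmacsww; AVX512BW materializes the +1/-1 vector
; through a mask register (vpcmpgtw into %k1 + vmovdqu16); the VL/VLBW
; variants skip the multiply and negate the halved difference under the mask
; instead (xor/sub, or a masked vpsubw from zero). For illustration only
; (not part of the test), a scalar sketch of the same computation:
;   int16_t midpoint_i16(int16_t a, int16_t b) { // hypothetical helper
;     int sign = (a > b) ? -1 : 1;               // %t4
;     int diff = (a > b) ? a - b : b - a;        // %t6 - %t5
;     return (int16_t)(a + sign * (diff >> 1));  // %a10
;   }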

define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nounwind {
; AVX1-LABEL: vec256_i16_unsigned_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm7
; AVX1-NEXT: vpcmpeqw %xmm7, %xmm0, %xmm8
; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubw %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_unsigned_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i16_unsigned_reg_reg:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtuw %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm5
; XOP-NEXT: vpminuw %xmm2, %xmm3, %xmm6
; XOP-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpminuw %xmm1, %xmm0, %xmm6
; XOP-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %t3 = icmp ugt <16 x i16> %a1, %a2
  %t4 = select <16 x i1> %t3, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <16 x i1> %t3, <16 x i16> %a2, <16 x i16> %a1
  %t6 = select <16 x i1> %t3, <16 x i16> %a1, <16 x i16> %a2
  %t7 = sub <16 x i16> %t6, %t5
  %t16 = lshr <16 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul <16 x i16> %t16, %t4
  %a10 = add <16 x i16> %t9, %a1
  ret <16 x i16> %a10
}
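
; For the unsigned flavour there is no direct 'ugt' word compare before
; AVX512: it is synthesized as "not equal to the unsigned minimum"
; (vpminuw + vpcmpeqw, then a NOT via vpxor with all-ones or vpternlogq).
; XOP provides vpcomgtuw directly, and AVX512 computes the predicate with
; vpcmpnleuw into %k1. The rest of the midpoint recipe is unchanged.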

; Values are loaded. Only check signed case.

define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounwind {
; AVX1-LABEL: vec256_i16_signed_mem_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpminsw %xmm0, %xmm2, %xmm6
; AVX1-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsubw %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpminsw %xmm1, %xmm3, %xmm6
; AVX1-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsw %ymm0, %ymm1, %ymm3
; AVX2-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i16_signed_mem_reg:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtw %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtw %xmm0, %xmm2, %xmm5
; XOP-NEXT: vpminsw %xmm1, %xmm3, %xmm6
; XOP-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpminsw %xmm0, %xmm2, %xmm6
; XOP-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm5, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i16_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsw %ymm0, %ymm1, %ymm3
; AVX512F-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vpxor %ymm0, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i16_signed_mem_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: vpsubw %ymm2, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i16_signed_mem_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VLBW-NEXT: vpcmpgtw %ymm0, %ymm1, %k1
; AVX512VLBW-NEXT: vpminsw %ymm0, %ymm1, %ymm2
; AVX512VLBW-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %ymm0, %ymm2, %ymm0 {%k1}
; AVX512VLBW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
  %a1 = load <16 x i16>, ptr %a1_addr
  %t3 = icmp sgt <16 x i16> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <16 x i1> %t3, <16 x i16> %a2, <16 x i16> %a1
  %t6 = select <16 x i1> %t3, <16 x i16> %a1, <16 x i16> %a2
  %t7 = sub <16 x i16> %t6, %t5
  %t16 = lshr <16 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <16 x i16> %t16, %t4 ; signed
  %a10 = add nsw <16 x i16> %t9, %a1 ; signed
  ret <16 x i16> %a10
}
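
; The mem variants only differ in where the operands come from: AVX1 and XOP
; fold the load as two 128-bit halves ((%rdi) and 16(%rdi)), while AVX2 and
; AVX512 load a single ymm. The midpoint computation itself is identical.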

define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i16_signed_reg_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm6
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i16_signed_reg_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtw %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm5
; XOP-NEXT: vpminsw %xmm2, %xmm3, %xmm6
; XOP-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm6
; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i16_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i16_signed_reg_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i16_signed_reg_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VLBW-NEXT: vpcmpgtw %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %a2 = load <16 x i16>, ptr %a2_addr
  %t3 = icmp sgt <16 x i16> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <16 x i1> %t3, <16 x i16> %a2, <16 x i16> %a1
  %t6 = select <16 x i1> %t3, <16 x i16> %a1, <16 x i16> %a2
  %t7 = sub <16 x i16> %t6, %t5
  %t16 = lshr <16 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <16 x i16> %t16, %t4 ; signed
  %a10 = add nsw <16 x i16> %t9, %a1 ; signed
  ret <16 x i16> %a10
}

define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i16_signed_mem_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpminsw %xmm0, %xmm2, %xmm6
; AVX1-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsubw %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpminsw %xmm1, %xmm3, %xmm6
; AVX1-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_signed_mem_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i16_signed_mem_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rsi), %xmm0
; XOP-NEXT: vmovdqa 16(%rsi), %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtw %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtw %xmm0, %xmm2, %xmm5
; XOP-NEXT: vpminsw %xmm1, %xmm3, %xmm6
; XOP-NEXT: vpmaxsw %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpminsw %xmm0, %xmm2, %xmm6
; XOP-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm5, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i16_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i16_signed_mem_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i16_signed_mem_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1
; AVX512VLBW-NEXT: vpcmpgtw %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %a1 = load <16 x i16>, ptr %a1_addr
  %a2 = load <16 x i16>, ptr %a2_addr
  %t3 = icmp sgt <16 x i16> %a1, %a2 ; signed
  %t4 = select <16 x i1> %t3, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t5 = select <16 x i1> %t3, <16 x i16> %a2, <16 x i16> %a1
  %t6 = select <16 x i1> %t3, <16 x i16> %a1, <16 x i16> %a2
  %t7 = sub <16 x i16> %t6, %t5
  %t16 = lshr <16 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t9 = mul nsw <16 x i16> %t16, %t4 ; signed
  %a10 = add nsw <16 x i16> %t9, %a1 ; signed
  ret <16 x i16> %a10
}

; ---------------------------------------------------------------------------- ;
; 8-bit width. 256 / 8 = 32 elts.
; ---------------------------------------------------------------------------- ;

; Values come from regs

define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_signed_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6
; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_signed_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i8_signed_reg_reg:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6
; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_signed_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i8_signed_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i8_signed_reg_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %t3 = icmp sgt <32 x i8> %a1, %a2 ; signed
  %t4 = select <32 x i1> %t3, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <32 x i1> %t3, <32 x i8> %a2, <32 x i8> %a1
  %t6 = select <32 x i1> %t3, <32 x i8> %a1, <32 x i8> %a2
  %t7 = sub <32 x i8> %t6, %t5
  %t8 = lshr <32 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <32 x i8> %t8, %t4 ; signed
  %a10 = add nsw <32 x i8> %t9, %a1 ; signed
  ret <32 x i8> %a10
}
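
; i8 has no byte-granularity multiply or shift, so the lowering works around
; both: the halved difference is a word shift (vpsrlw $1) followed by an AND
; with 127 in every byte, and the byte multiply is emulated with two
; vpmaddubsw ops over the even and odd byte lanes that are recombined with
; vpsllw $8 + vpor (or vpperm on XOP). XOP can shift bytes directly (vpshlb
; by -1), and AVX512BW sidesteps the problem by zero-extending to 32 words,
; multiplying with vpmullw, and truncating back with vpmovwb.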

define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_unsigned_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpminub %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm7
; AVX1-NEXT: vpcmpeqb %xmm7, %xmm0, %xmm8
; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpmaxub %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8
; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_unsigned_reg_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4
; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i8_unsigned_reg_reg:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtub %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtub %xmm1, %xmm0, %xmm5
; XOP-NEXT: vpminub %xmm1, %xmm0, %xmm6
; XOP-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpminub %xmm2, %xmm3, %xmm6
; XOP-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %t3 = icmp ugt <32 x i8> %a1, %a2
  %t4 = select <32 x i1> %t3, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <32 x i1> %t3, <32 x i8> %a2, <32 x i8> %a1
  %t6 = select <32 x i1> %t3, <32 x i8> %a1, <32 x i8> %a2
  %t7 = sub <32 x i8> %t6, %t5
  %t8 = lshr <32 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul <32 x i8> %t8, %t4
  %a10 = add <32 x i8> %t9, %a1
  ret <32 x i8> %a10
}
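
; The unsigned i8 test mirrors the i16 one: pre-AVX512 targets derive 'ugt'
; from vpminub + vpcmpeqb plus a NOT (vpxor with all-ones, or vpternlogq on
; AVX512F/VL), XOP compares directly with vpcomgtub, and AVX512BW sets %k1
; with vpcmpnleub.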

; Values are loaded. Only check signed case.

define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_signed_mem_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm6
; AVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6
; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsb %ymm0, %ymm1, %ymm3
; AVX2-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i8_signed_mem_reg:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5
; XOP-NEXT: vpminsb %xmm0, %xmm2, %xmm6
; XOP-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsubb %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpminsb %xmm1, %xmm3, %xmm6
; XOP-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_signed_mem_reg:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i8_signed_mem_reg:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm0, %zmm0
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i8_signed_mem_reg:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VLBW-NEXT: vpcmpgtb %ymm0, %ymm1, %k1
; AVX512VLBW-NEXT: vpminsb %ymm0, %ymm1, %ymm2
; AVX512VLBW-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %ymm0, %ymm2, %ymm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
  %a1 = load <32 x i8>, ptr %a1_addr
  %t3 = icmp sgt <32 x i8> %a1, %a2 ; signed
  %t4 = select <32 x i1> %t3, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <32 x i1> %t3, <32 x i8> %a2, <32 x i8> %a1
  %t6 = select <32 x i1> %t3, <32 x i8> %a1, <32 x i8> %a2
  %t7 = sub <32 x i8> %t6, %t5
  %t8 = lshr <32 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <32 x i8> %t8, %t4 ; signed
  %a10 = add nsw <32 x i8> %t9, %a1 ; signed
  ret <32 x i8> %a10
}
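
; As with i16, the remaining signed i8 tests only vary which operand is
; folded from memory; the arithmetic is unchanged.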
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 2394; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero 2395; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm0, %zmm0 2396; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm0, %ymm0 2397; AVX512BW-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 2398; AVX512BW-FALLBACK-NEXT: retq 2399; 2400; AVX512VLBW-LABEL: vec256_i8_signed_mem_reg: 2401; AVX512VLBW: # %bb.0: 2402; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm1 2403; AVX512VLBW-NEXT: vpcmpgtb %ymm0, %ymm1, %k1 2404; AVX512VLBW-NEXT: vpminsb %ymm0, %ymm1, %ymm2 2405; AVX512VLBW-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 2406; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 2407; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm0 2408; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 2409; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2410; AVX512VLBW-NEXT: vpsubb %ymm0, %ymm2, %ymm0 {%k1} 2411; AVX512VLBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 2412; AVX512VLBW-NEXT: retq 2413 %a1 = load <32 x i8>, ptr %a1_addr 2414 %t3 = icmp sgt <32 x i8> %a1, %a2 ; signed 2415 %t4 = select <32 x i1> %t3, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2416 %t5 = select <32 x i1> %t3, <32 x i8> %a2, <32 x i8> %a1 2417 %t6 = select <32 x i1> %t3, <32 x i8> %a1, <32 x i8> %a2 2418 %t7 = sub <32 x i8> %t6, %t5 2419 %t8 = lshr <32 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2420 %t9 = mul nsw <32 x i8> %t8, %t4 ; signed 2421 %a10 = add nsw <32 x i8> %t9, %a1 ; signed 2422 ret <32 x i8> %a10 2423} 2424 2425define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind { 2426; AVX1-LABEL: vec256_i8_signed_reg_mem: 2427; AVX1: # %bb.0: 2428; AVX1-NEXT: vmovdqa (%rdi), %xmm2 2429; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 2430; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2431; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 2432; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 2433; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6 2434; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2 2435; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 2436; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm6 2437; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 2438; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 2439; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 2440; AVX1-NEXT: 
define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i8_signed_reg_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6
; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm6
; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i8_signed_reg_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6
; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_signed_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i8_signed_reg_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i8_signed_reg_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VLBW-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %a2 = load <32 x i8>, ptr %a2_addr
  %t3 = icmp sgt <32 x i8> %a1, %a2 ; signed
  %t4 = select <32 x i1> %t3, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <32 x i1> %t3, <32 x i8> %a2, <32 x i8> %a1
  %t6 = select <32 x i1> %t3, <32 x i8> %a1, <32 x i8> %a2
  %t7 = sub <32 x i8> %t6, %t5
  %t8 = lshr <32 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <32 x i8> %t8, %t4 ; signed
  %a10 = add nsw <32 x i8> %t9, %a1 ; signed
  ret <32 x i8> %a10
}
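
; Both operands of the mem_mem variant below are loaded from memory. As in the
; variants above, AVX512VLBW folds the multiply by +/-1 into a conditional
; negate (a masked vpsubb from zero under the vpcmpgtb k-mask), while
; AVX512VL-FALLBACK gets the same effect with a vpternlogd xor followed by
; vpsubb.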
define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i8_signed_mem_mem:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_signed_mem_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i8_signed_mem_mem:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rsi), %xmm0
; XOP-NEXT: vmovdqa 16(%rsi), %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5
; XOP-NEXT: vpminsb %xmm0, %xmm2, %xmm6
; XOP-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsubb %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpminsb %xmm1, %xmm3, %xmm6
; XOP-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-FALLBACK-LABEL: vec256_i8_signed_mem_mem:
; AVX512BW-FALLBACK: # %bb.0:
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-FALLBACK-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
;
; AVX512VLBW-LABEL: vec256_i8_signed_mem_mem:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1
; AVX512VLBW-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
; AVX512VLBW-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
  %a1 = load <32 x i8>, ptr %a1_addr
  %a2 = load <32 x i8>, ptr %a2_addr
  %t3 = icmp sgt <32 x i8> %a1, %a2 ; signed
  %t4 = select <32 x i1> %t3, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <32 x i1> %t3, <32 x i8> %a2, <32 x i8> %a1
  %t6 = select <32 x i1> %t3, <32 x i8> %a1, <32 x i8> %a2
  %t7 = sub <32 x i8> %t6, %t5
  %t8 = lshr <32 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <32 x i8> %t8, %t4 ; signed
  %a10 = add nsw <32 x i8> %t9, %a1 ; signed
  ret <32 x i8> %a10
}
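
; Lowering note for the i8 midpoint tests above (a hand-written summary, not an
; autogenerated assertion): AVX1/AVX2/AVX512F and XOP lack a vector i8 multiply,
; so the product with the +/-1 vector is assembled from two vpmaddubsw ops over
; the even/odd byte lanes and recombined with vpsllw $8 + vpor (a single vpperm
; on XOP); AVX512BW-FALLBACK instead widens the bytes to words (vpmovzxbw),
; multiplies with vpmullw, and narrows back with vpmovwb.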