1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512F 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL-FALLBACK 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW 6 7; These test cases are inspired by C++2a std::midpoint(). 8; See https://bugs.llvm.org/show_bug.cgi?id=40965 9 10; Using 512-bit vector regs. 11 12; ---------------------------------------------------------------------------- ; 13; 32-bit width. 512 / 32 = 16 elts. 14; ---------------------------------------------------------------------------- ; 15 16; Values come from regs 17 18define <16 x i32> @vec512_i32_signed_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind { 19; ALL-LABEL: vec512_i32_signed_reg_reg: 20; ALL: # %bb.0: 21; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2 22; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1 23; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1 24; ALL-NEXT: vpsrld $1, %zmm1, %zmm1 25; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1 26; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 27; ALL-NEXT: retq 28 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 29 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 30 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 31 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 32 %t7 = sub <16 x i32> %t6, %t5 33 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 34 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 35 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 36 ret <16 x i32> %a10 37} 38 39define <16 x i32> @vec512_i32_unsigned_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind { 40; ALL-LABEL: vec512_i32_unsigned_reg_reg: 41; ALL: # %bb.0: 42; ALL-NEXT: vpminud %zmm1, %zmm0, %zmm2 43; ALL-NEXT: vpmaxud %zmm1, %zmm0, %zmm1 44; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1 45; ALL-NEXT: vpsrld $1, %zmm1, %zmm1 46; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1 47; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 48; ALL-NEXT: retq 49 %t3 = icmp ugt <16 x i32> %a1, %a2 50 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 51 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 52 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 53 %t7 = sub <16 x i32> %t6, %t5 54 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 55 %t9 = mul <16 x i32> %t16, %t16 56 %a10 = add <16 x i32> %t9, %a1 57 ret <16 x i32> %a10 58} 59 60; Values are loaded. Only check signed case. 
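; As a scalar point of reference (not part of the original test and not
; checked by FileCheck): the <16 x i32> tests above apply this recipe per
; lane, i.e. take signed/unsigned min and max, subtract, halve with a logical
; shift, then (in these i32 tests specifically) multiply the halved difference
; by itself, which is why the autogenerated checks show
; "vpmulld %zmm1, %zmm1, %zmm1" and no compare instruction, the %t4 select of
; +1/-1 being dead. The @scalar_i32_signed_ref name below is ours, chosen for
; illustration only.

define i32 @scalar_i32_signed_ref(i32 %a1, i32 %a2) nounwind {
  %t3 = icmp sgt i32 %a1, %a2 ; signed
  %t5 = select i1 %t3, i32 %a2, i32 %a1 ; smaller value
  %t6 = select i1 %t3, i32 %a1, i32 %a2 ; larger value
  %t7 = sub i32 %t6, %t5
  %t16 = lshr i32 %t7, 1
  %t9 = mul nsw i32 %t16, %t16 ; same squared form as the i32 tests above
  %a10 = add nsw i32 %t9, %a1
  ret i32 %a10
}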
61 62define <16 x i32> @vec512_i32_signed_mem_reg(ptr %a1_addr, <16 x i32> %a2) nounwind { 63; ALL-LABEL: vec512_i32_signed_mem_reg: 64; ALL: # %bb.0: 65; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 66; ALL-NEXT: vpminsd %zmm0, %zmm1, %zmm2 67; ALL-NEXT: vpmaxsd %zmm0, %zmm1, %zmm0 68; ALL-NEXT: vpsubd %zmm2, %zmm0, %zmm0 69; ALL-NEXT: vpsrld $1, %zmm0, %zmm0 70; ALL-NEXT: vpmulld %zmm0, %zmm0, %zmm0 71; ALL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 72; ALL-NEXT: retq 73 %a1 = load <16 x i32>, ptr %a1_addr 74 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 75 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 76 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 77 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 78 %t7 = sub <16 x i32> %t6, %t5 79 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 80 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 81 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 82 ret <16 x i32> %a10 83} 84 85define <16 x i32> @vec512_i32_signed_reg_mem(<16 x i32> %a1, ptr %a2_addr) nounwind { 86; ALL-LABEL: vec512_i32_signed_reg_mem: 87; ALL: # %bb.0: 88; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 89; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2 90; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1 91; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1 92; ALL-NEXT: vpsrld $1, %zmm1, %zmm1 93; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1 94; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 95; ALL-NEXT: retq 96 %a2 = load <16 x i32>, ptr %a2_addr 97 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 98 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 99 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 100 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2 101 %t7 = sub <16 x i32> %t6, %t5 102 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 103 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 104 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 105 ret <16 x i32> %a10 106} 107 108define <16 x i32> @vec512_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { 109; ALL-LABEL: vec512_i32_signed_mem_mem: 110; ALL: # %bb.0: 111; ALL-NEXT: vmovdqa64 (%rdi), %zmm0 112; ALL-NEXT: vmovdqa64 (%rsi), %zmm1 113; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2 114; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1 115; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1 116; ALL-NEXT: vpsrld $1, %zmm1, %zmm1 117; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1 118; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 119; ALL-NEXT: retq 120 %a1 = load <16 x i32>, ptr %a1_addr 121 %a2 = load <16 x i32>, ptr %a2_addr 122 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed 123 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 124 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1 125 %t6 = select <16 x i1> %t3, <16 x 
i32> %a1, <16 x i32> %a2 126 %t7 = sub <16 x i32> %t6, %t5 127 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 128 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed 129 %a10 = add nsw <16 x i32> %t9, %a1 ; signed 130 ret <16 x i32> %a10 131} 132 133; ---------------------------------------------------------------------------- ; 134; 64-bit width. 512 / 64 = 8 elts. 135; ---------------------------------------------------------------------------- ; 136 137; Values come from regs 138 139define <8 x i64> @vec512_i64_signed_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind { 140; ALL-LABEL: vec512_i64_signed_reg_reg: 141; ALL: # %bb.0: 142; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 143; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2 144; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 145; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 146; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1 147; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 148; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1} 149; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 150; ALL-NEXT: retq 151 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 152 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 153 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 154 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 155 %t7 = sub <8 x i64> %t6, %t5 156 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 157 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 158 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 159 ret <8 x i64> %a10 160} 161 162define <8 x i64> @vec512_i64_unsigned_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind { 163; ALL-LABEL: vec512_i64_unsigned_reg_reg: 164; ALL: # %bb.0: 165; ALL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 166; ALL-NEXT: vpminuq %zmm1, %zmm0, %zmm2 167; ALL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 168; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 169; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1 170; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 171; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1} 172; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 173; ALL-NEXT: retq 174 %t3 = icmp ugt <8 x i64> %a1, %a2 175 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 176 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 177 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 178 %t7 = sub <8 x i64> %t6, %t5 179 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 180 %t9 = mul <8 x i64> %t8, %t4 181 %a10 = add <8 x i64> %t9, %a1 182 ret <8 x i64> %a10 183} 184 185; Values are loaded. Only check signed case. 
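; For comparison, a minimal scalar sketch (not from the original test and not
; checked by FileCheck) of the std::midpoint-style recipe that the <8 x i64>
; tests follow per lane: the halved difference is multiplied by +1/-1 from %t4.
; The AVX-512 lowering above turns that multiply into a conditional negation,
; a masked "vpsubq %zmm1, %zmm2, %zmm1 {%k1}" from zero under the
; vpcmpgtq/vpcmpnleuq mask, rather than an actual multiply. The
; @scalar_i64_signed_midpoint name is ours, chosen for illustration only.

define i64 @scalar_i64_signed_midpoint(i64 %a1, i64 %a2) nounwind {
  %t3 = icmp sgt i64 %a1, %a2 ; signed
  %t4 = select i1 %t3, i64 -1, i64 1
  %t5 = select i1 %t3, i64 %a2, i64 %a1 ; smaller value
  %t6 = select i1 %t3, i64 %a1, i64 %a2 ; larger value
  %t7 = sub i64 %t6, %t5
  %t8 = lshr i64 %t7, 1
  %t9 = mul nsw i64 %t8, %t4 ; negate the halved difference when %a1 > %a2
  %a10 = add nsw i64 %t9, %a1
  ret i64 %a10
}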
186 187define <8 x i64> @vec512_i64_signed_mem_reg(ptr %a1_addr, <8 x i64> %a2) nounwind { 188; ALL-LABEL: vec512_i64_signed_mem_reg: 189; ALL: # %bb.0: 190; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 191; ALL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 192; ALL-NEXT: vpminsq %zmm0, %zmm1, %zmm2 193; ALL-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 194; ALL-NEXT: vpsubq %zmm2, %zmm0, %zmm0 195; ALL-NEXT: vpsrlq $1, %zmm0, %zmm0 196; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 197; ALL-NEXT: vpsubq %zmm0, %zmm2, %zmm0 {%k1} 198; ALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0 199; ALL-NEXT: retq 200 %a1 = load <8 x i64>, ptr %a1_addr 201 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 202 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 203 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 204 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 205 %t7 = sub <8 x i64> %t6, %t5 206 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 207 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 208 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 209 ret <8 x i64> %a10 210} 211 212define <8 x i64> @vec512_i64_signed_reg_mem(<8 x i64> %a1, ptr %a2_addr) nounwind { 213; ALL-LABEL: vec512_i64_signed_reg_mem: 214; ALL: # %bb.0: 215; ALL-NEXT: vmovdqa64 (%rdi), %zmm1 216; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 217; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2 218; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 219; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 220; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1 221; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 222; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1} 223; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 224; ALL-NEXT: retq 225 %a2 = load <8 x i64>, ptr %a2_addr 226 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 227 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 228 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 229 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 230 %t7 = sub <8 x i64> %t6, %t5 231 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 232 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 233 %a10 = add nsw <8 x i64> %t9, %a1 ; signed 234 ret <8 x i64> %a10 235} 236 237define <8 x i64> @vec512_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { 238; ALL-LABEL: vec512_i64_signed_mem_mem: 239; ALL: # %bb.0: 240; ALL-NEXT: vmovdqa64 (%rdi), %zmm0 241; ALL-NEXT: vmovdqa64 (%rsi), %zmm1 242; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 243; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2 244; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 245; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1 246; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1 247; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 248; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1} 249; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 250; ALL-NEXT: retq 251 %a1 = load <8 x i64>, ptr %a1_addr 252 %a2 = load <8 x i64>, ptr %a2_addr 253 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed 254 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 255 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1 256 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2 257 %t7 = sub <8 x i64> %t6, %t5 258 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 259 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed 260 %a10 = add nsw <8 x i64> %t9, 
%a1 ; signed 261 ret <8 x i64> %a10 262} 263 264; ---------------------------------------------------------------------------- ; 265; 16-bit width. 512 / 16 = 32 elts. 266; ---------------------------------------------------------------------------- ; 267 268; Values come from regs 269 270define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind { 271; AVX512F-LABEL: vec512_i16_signed_reg_reg: 272; AVX512F: # %bb.0: 273; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 274; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 275; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 276; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 277; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 278; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 279; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 280; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 281; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 282; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 283; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 284; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 285; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 286; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 287; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 288; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 289; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 290; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 291; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 292; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 293; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 294; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 295; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 296; AVX512F-NEXT: retq 297; 298; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg: 299; AVX512VL-FALLBACK: # %bb.0: 300; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 301; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 302; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 303; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 304; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 305; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 306; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 307; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 308; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 309; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 310; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 311; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 312; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 313; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 314; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 315; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 316; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 317; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 318; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 319; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 320; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 321; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 322; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 323; AVX512VL-FALLBACK-NEXT: retq 324; 325; AVX512BW-LABEL: vec512_i16_signed_reg_reg: 326; AVX512BW: # %bb.0: 327; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 328; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 329; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1 330; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 331; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 332; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 333; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1} 334; AVX512BW-NEXT: vpaddw 
%zmm0, %zmm1, %zmm0 335; AVX512BW-NEXT: retq 336 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 337 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 338 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 339 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 340 %t7 = sub <32 x i16> %t6, %t5 341 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 342 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 343 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 344 ret <32 x i16> %a10 345} 346 347define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind { 348; AVX512F-LABEL: vec512_i16_unsigned_reg_reg: 349; AVX512F: # %bb.0: 350; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 351; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 352; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4 353; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 354; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6 355; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 356; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 357; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 358; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 359; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 360; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 361; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 362; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 363; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 364; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 365; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 366; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 367; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 368; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) 369; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 370; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 371; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 372; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 373; AVX512F-NEXT: retq 374; 375; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg: 376; AVX512VL-FALLBACK: # %bb.0: 377; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 378; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 379; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4 380; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 381; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6 382; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 383; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 384; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 385; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 386; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 387; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 388; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 389; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 390; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 391; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 392; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 393; 
AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 394; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 395; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) 396; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 397; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 398; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 399; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 400; AVX512VL-FALLBACK-NEXT: retq 401; 402; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg: 403; AVX512BW: # %bb.0: 404; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 405; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm2 406; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm1 407; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 408; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 409; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 410; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1} 411; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 412; AVX512BW-NEXT: retq 413 %t3 = icmp ugt <32 x i16> %a1, %a2 414 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 415 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 416 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 417 %t7 = sub <32 x i16> %t6, %t5 418 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 419 %t9 = mul <32 x i16> %t16, %t4 420 %a10 = add <32 x i16> %t9, %a1 421 ret <32 x i16> %a10 422} 423 424; Values are loaded. Only check signed case. 
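; An equivalent way to write the multiply-by-+1/-1 step is an explicit select
; between the halved difference and its negation; a minimal scalar sketch of
; that form follows (not from the original test, not checked by FileCheck, and
; the @scalar_i16_signed_midpoint_selectform name is ours). This is in effect
; what AVX512BW emits for the <32 x i16> tests, a masked "vpsubw ... {%k1}"
; from zero. Lacking 512-bit word operations, the AVX512F and
; AVX512VL-FALLBACK paths instead operate on the two 256-bit halves and blend
; the negated and non-negated values under the compare mask via vpternlogq.

define i16 @scalar_i16_signed_midpoint_selectform(i16 %a1, i16 %a2) nounwind {
  %t3 = icmp sgt i16 %a1, %a2 ; signed
  %t5 = select i1 %t3, i16 %a2, i16 %a1 ; smaller value
  %t6 = select i1 %t3, i16 %a1, i16 %a2 ; larger value
  %t7 = sub i16 %t6, %t5
  %t8 = lshr i16 %t7, 1
  %neg = sub i16 0, %t8
  %t9 = select i1 %t3, i16 %neg, i16 %t8 ; conditional negation instead of mul
  %a10 = add i16 %t9, %a1
  ret i16 %a10
}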
425 426define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounwind { 427; AVX512F-LABEL: vec512_i16_signed_mem_reg: 428; AVX512F: # %bb.0: 429; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 430; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 431; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 432; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 433; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 434; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 435; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 436; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 437; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 438; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 439; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 440; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 441; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 442; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 443; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 444; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 445; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 446; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 447; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 448; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) 449; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 450; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 451; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 452; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 453; AVX512F-NEXT: retq 454; 455; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg: 456; AVX512VL-FALLBACK: # %bb.0: 457; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 458; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 459; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 460; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 461; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 462; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 463; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 464; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 465; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 466; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 467; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 468; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 469; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 470; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 471; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 472; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 473; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 474; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 475; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 476; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) 477; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 478; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 479; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 480; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 481; AVX512VL-FALLBACK-NEXT: retq 482; 483; AVX512BW-LABEL: vec512_i16_signed_mem_reg: 484; AVX512BW: # %bb.0: 485; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 486; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 487; AVX512BW-NEXT: vpminsw %zmm0, %zmm1, %zmm2 488; AVX512BW-NEXT: vpmaxsw %zmm0, %zmm1, %zmm0 489; AVX512BW-NEXT: vpsubw %zmm2, %zmm0, %zmm0 490; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 491; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 492; AVX512BW-NEXT: vpsubw %zmm0, %zmm2, %zmm0 {%k1} 493; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 494; AVX512BW-NEXT: retq 495 %a1 = load <32 x i16>, ptr %a1_addr 496 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 497 %t4 = select <32 x i1> %t3, 
<32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 498 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 499 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 500 %t7 = sub <32 x i16> %t6, %t5 501 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 502 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 503 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 504 ret <32 x i16> %a10 505} 506 507define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind { 508; AVX512F-LABEL: vec512_i16_signed_reg_mem: 509; AVX512F: # %bb.0: 510; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 511; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 512; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 513; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 514; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 515; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 516; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 517; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 518; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 519; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 520; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 521; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 522; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 523; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 524; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 525; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 526; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 527; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 528; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 529; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 530; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 531; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 532; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 533; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 534; AVX512F-NEXT: retq 535; 536; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: 537; AVX512VL-FALLBACK: # %bb.0: 538; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 539; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 540; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 541; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 542; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 543; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 544; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 545; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 546; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 547; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 548; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 549; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 550; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 551; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 552; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 553; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 554; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 555; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 556; 
AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 557; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 558; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 559; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 560; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 561; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 562; AVX512VL-FALLBACK-NEXT: retq 563; 564; AVX512BW-LABEL: vec512_i16_signed_reg_mem: 565; AVX512BW: # %bb.0: 566; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 567; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 568; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 569; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1 570; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 571; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 572; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 573; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1} 574; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 575; AVX512BW-NEXT: retq 576 %a2 = load <32 x i16>, ptr %a2_addr 577 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 578 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 579 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 580 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 581 %t7 = sub <32 x i16> %t6, %t5 582 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 583 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 584 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 585 ret <32 x i16> %a10 586} 587 588define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { 589; AVX512F-LABEL: vec512_i16_signed_mem_mem: 590; AVX512F: # %bb.0: 591; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 592; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 593; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 594; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 595; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 596; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 597; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 598; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 599; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 600; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 601; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 602; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 603; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 604; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 605; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 606; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 607; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 608; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 609; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 610; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 611; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) 612; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 613; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 614; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 615; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 616; AVX512F-NEXT: retq 617; 618; AVX512VL-FALLBACK-LABEL: 
vec512_i16_signed_mem_mem: 619; AVX512VL-FALLBACK: # %bb.0: 620; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 621; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 622; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 623; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 624; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 625; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 626; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 627; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 628; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 629; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 630; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 631; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 632; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 633; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 634; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 635; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 636; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 637; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 638; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 639; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 640; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) 641; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 642; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 643; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 644; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 645; AVX512VL-FALLBACK-NEXT: retq 646; 647; AVX512BW-LABEL: vec512_i16_signed_mem_mem: 648; AVX512BW: # %bb.0: 649; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 650; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 651; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 652; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 653; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1 654; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 655; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 656; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 657; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1} 658; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 659; AVX512BW-NEXT: retq 660 %a1 = load <32 x i16>, ptr %a1_addr 661 %a2 = load <32 x i16>, ptr %a2_addr 662 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed 663 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 664 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1 665 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2 666 %t7 = sub <32 x i16> %t6, %t5 667 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 668 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed 669 %a10 = add nsw <32 x i16> %t9, %a1 ; signed 670 ret <32 x i16> %a10 671} 672 673; ---------------------------------------------------------------------------- ; 674; 8-bit width. 512 / 8 = 64 elts. 
675; ---------------------------------------------------------------------------- ; 676 677; Values come from regs 678 679define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { 680; AVX512F-LABEL: vec512_i8_signed_reg_reg: 681; AVX512F: # %bb.0: 682; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 683; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 684; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 685; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 686; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 687; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 688; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 689; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 690; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 691; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 692; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 693; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 694; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 695; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 696; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 697; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 698; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 699; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 700; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 701; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 702; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 703; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 704; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 705; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 706; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 707; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 708; AVX512F-NEXT: retq 709; 710; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg: 711; AVX512VL-FALLBACK: # %bb.0: 712; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 713; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 714; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 715; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 716; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 717; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 718; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 719; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 720; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 721; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 722; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 723; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 724; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 725; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2 726; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 727; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 728; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 729; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 730; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 731; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 732; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 733; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) 734; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 735; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 736; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 737; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 738; AVX512VL-FALLBACK-NEXT: retq 739; 740; AVX512BW-LABEL: 
vec512_i8_signed_reg_reg: 741; AVX512BW: # %bb.0: 742; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 743; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2 744; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 745; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 746; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 747; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 748; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 749; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} 750; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 751; AVX512BW-NEXT: retq 752 %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed 753 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 754 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1 755 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2 756 %t7 = sub <64 x i8> %t6, %t5 757 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 758 %t9 = mul nsw <64 x i8> %t8, %t4 ; signed 759 %a10 = add nsw <64 x i8> %t9, %a1 ; signed 760 ret <64 x i8> %a10 761} 762 763define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { 764; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: 765; AVX512F: # %bb.0: 766; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 767; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 768; AVX512F-NEXT: vpminub %ymm2, %ymm3, %ymm4 769; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 770; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 771; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 772; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 773; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 774; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 775; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 776; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 777; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 778; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 779; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 780; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 781; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 782; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 783; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 784; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 785; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 786; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 787; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) 788; AVX512F-NEXT: 
vextracti64x4 $1, %zmm1, %ymm2 789; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 790; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 791; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 792; AVX512F-NEXT: retq 793; 794; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: 795; AVX512VL-FALLBACK: # %bb.0: 796; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 797; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 798; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm3, %ymm4 799; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 800; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 801; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 802; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 803; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 804; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 805; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 806; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 807; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 808; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 809; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2 810; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 811; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 812; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 813; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 814; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 815; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 816; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 817; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) 818; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 819; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 820; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 821; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 822; AVX512VL-FALLBACK-NEXT: retq 823; 824; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg: 825; AVX512BW: # %bb.0: 826; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 827; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm2 828; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm1 829; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 830; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 831; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 832; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 833; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} 834; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 835; AVX512BW-NEXT: retq 836 %t3 = icmp ugt <64 x i8> %a1, %a2 837 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 838 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1 
839 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2 840 %t7 = sub <64 x i8> %t6, %t5 841 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 842 %t9 = mul <64 x i8> %t8, %t4 843 %a10 = add <64 x i8> %t9, %a1 844 ret <64 x i8> %a10 845} 846 847; Values are loaded. Only check signed case. 848 849define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind { 850; AVX512F-LABEL: vec512_i8_signed_mem_reg: 851; AVX512F: # %bb.0: 852; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 853; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 854; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 855; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 856; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 857; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 858; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 859; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 860; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 861; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 862; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 863; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 864; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 865; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 866; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 867; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 868; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 869; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 870; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 871; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 872; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 873; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 874; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 875; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) 876; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 877; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 878; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 879; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 880; AVX512F-NEXT: retq 881; 882; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: 883; AVX512VL-FALLBACK: # %bb.0: 884; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 885; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 886; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 887; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 888; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 889; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 890; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 891; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 892; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 893; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 894; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 895; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 896; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 897; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 898; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 899; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-NEXT:    vpminsb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %zmm0, %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = load <64 x i8>, ptr %a1_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_signed_reg_mem:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %a2 = load <64 x i8>, ptr %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
; AVX512F-NEXT:    vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK:       # %bb.0:
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm1
; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = load <64 x i8>, ptr %a1_addr
  %a2 = load <64 x i8>, ptr %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}
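
; Scalar reference for the pattern exercised above, illustrative only and not
; part of the autogenerated assertions: a minimal sketch of the same signed
; midpoint computation on a plain i8, assuming the hypothetical function name
; @scalar_i8_signed_reg_reg_sketch. It mirrors the select-based formulation
; used by the <64 x i8> tests.
define i8 @scalar_i8_signed_reg_reg_sketch(i8 %a1, i8 %a2) nounwind {
  %t3 = icmp sgt i8 %a1, %a2         ; pick the sign of the step
  %t4 = select i1 %t3, i8 -1, i8 1   ; -1 or +1 multiplier
  %t5 = select i1 %t3, i8 %a2, i8 %a1 ; min(a1, a2)
  %t6 = select i1 %t3, i8 %a1, i8 %a2 ; max(a1, a2)
  %t7 = sub i8 %t6, %t5              ; non-negative difference
  %t8 = lshr i8 %t7, 1               ; halve the difference
  %t9 = mul nsw i8 %t8, %t4          ; apply the sign
  %a10 = add nsw i8 %t9, %a1         ; a1 + sign * (|a1 - a2| / 2)
  ret i8 %a10
}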