; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ

;
; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b)
;
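; There is no element-wise absolute-difference instruction on x86, so the
; pattern is lowered to sub(umax(a,b),umin(a,b)) using the unsigned vector
; min/max instructions. AVX512BW handles 512-bit byte/word elements directly;
; with only AVX512DQ they are split into 256-bit halves and recombined with
; vinserti64x4. The _undef variants pass true for the llvm.abs poison flag;
; the extended difference can never be INT_MIN, so the flag does not affect
; the generated code.
;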

define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <64 x i8> %a to <64 x i64>
  %bext = zext <64 x i8> %b to <64 x i64>
  %sub = sub <64 x i64> %aext, %bext
  %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 false)
  %trunc = trunc <64 x i64> %abs to <64 x i8>
  ret <64 x i8> %trunc
}

define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v64i8_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v64i8_undef:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <64 x i8> %a to <64 x i64>
  %bext = zext <64 x i8> %b to <64 x i64>
  %sub = sub <64 x i64> %aext, %bext
  %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 true)
  %trunc = trunc <64 x i64> %abs to <64 x i8>
  ret <64 x i8> %trunc
}

define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <32 x i16> %a to <32 x i64>
  %bext = zext <32 x i16> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false)
  %trunc = trunc <32 x i64> %abs to <32 x i16>
  ret <32 x i16> %trunc
}

define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v32i16_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v32i16_undef:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <32 x i16> %a to <32 x i64>
  %bext = zext <32 x i16> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true)
  %trunc = trunc <32 x i64> %abs to <32 x i16>
  ret <32 x i16> %trunc
}

define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_ext_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <16 x i32> %a to <16 x i64>
  %bext = zext <16 x i32> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i32>
  ret <16 x i32> %trunc
}

define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_ext_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <16 x i32> %a to <16 x i64>
  %bext = zext <16 x i32> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i32>
  ret <16 x i32> %trunc
}

define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_ext_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <8 x i64> %a to <8 x i128>
  %bext = zext <8 x i64> %b to <8 x i128>
  %sub = sub <8 x i128> %aext, %bext
  %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 false)
  %trunc = trunc <8 x i128> %abs to <8 x i64>
  ret <8 x i64> %trunc
}

define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_ext_v8i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <8 x i64> %a to <8 x i128>
  %bext = zext <8 x i64> %b to <8 x i128>
  %sub = sub <8 x i128> %aext, %bext
  %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 true)
  %trunc = trunc <8 x i128> %abs to <8 x i64>
  ret <8 x i64> %trunc
}

;
; sub(umax(a,b),umin(a,b)) -> abdu(a,b)
;
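; This is already the lowered form, so it maps one-to-one onto the same
; min/max/sub sequences checked above.
;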

define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_minmax_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_minmax_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %min = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a, <64 x i8> %b)
  %max = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %a, <64 x i8> %b)
  %sub = sub <64 x i8> %max, %min
  ret <64 x i8> %sub
}

define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_minmax_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_minmax_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %min = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a, <32 x i16> %b)
  %max = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %a, <32 x i16> %b)
  %sub = sub <32 x i16> %max, %min
  ret <32 x i16> %sub
}

define <16 x i32> @abd_minmax_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_minmax_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %min = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %a, <16 x i32> %b)
  %max = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %a, <16 x i32> %b)
  %sub = sub <16 x i32> %max, %min
  ret <16 x i32> %sub
}

define <8 x i64> @abd_minmax_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_minmax_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %min = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %a, <8 x i64> %b)
  %max = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %a, <8 x i64> %b)
  %sub = sub <8 x i64> %max, %min
  ret <8 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abdu(a,b)
;
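; Exercised with several predicates (ugt, uge, ult) and with the select arms
; commuted; every variant reduces to the same min/max/sub sequence.
;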

define <64 x i8> @abd_cmp_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_cmp_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_cmp_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %cmp = icmp ugt <64 x i8> %a, %b
  %ab = sub <64 x i8> %a, %b
  %ba = sub <64 x i8> %b, %a
  %sel = select <64 x i1> %cmp, <64 x i8> %ab, <64 x i8> %ba
  ret <64 x i8> %sel
}

define <32 x i16> @abd_cmp_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_cmp_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_cmp_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %cmp = icmp uge <32 x i16> %a, %b
  %ab = sub <32 x i16> %a, %b
  %ba = sub <32 x i16> %b, %a
  %sel = select <32 x i1> %cmp, <32 x i16> %ab, <32 x i16> %ba
  ret <32 x i16> %sel
}

define <16 x i32> @abd_cmp_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_cmp_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %cmp = icmp ult <16 x i32> %a, %b
  %ab = sub <16 x i32> %a, %b
  %ba = sub <16 x i32> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i32> %ba, <16 x i32> %ab
  ret <16 x i32> %sel
}

define <8 x i64> @abd_cmp_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_cmp_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %cmp = icmp uge <8 x i64> %a, %b
  %ab = sub <8 x i64> %a, %b
  %ba = sub <8 x i64> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i64> %ab, <8 x i64> %ba
  ret <8 x i64> %sel
}

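; Intrinsic declarations for the llvm.abs, llvm.umax and llvm.umin calls above.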
declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
declare <64 x i64> @llvm.abs.v64i64(<64 x i64>, i1)
declare <8 x i128> @llvm.abs.v8i128(<8 x i128>, i1)

declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)

declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>)