; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

;
; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b)
;

define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: abd_ext_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminub %xmm1, %xmm2
; SSE-NEXT:    pmaxub %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_ext_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = zext <16 x i8> %a to <16 x i64>
  %bext = zext <16 x i8> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i8>
  ret <16 x i8> %trunc
}

define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: abd_ext_v16i8_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminub %xmm1, %xmm2
; SSE-NEXT:    pmaxub %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_ext_v16i8_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = zext <16 x i8> %a to <16 x i64>
  %bext = zext <16 x i8> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i8>
  ret <16 x i8> %trunc
}

define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: abd_ext_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psubusw %xmm0, %xmm2
; SSE2-NEXT:    psubusw %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v8i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminuw %xmm1, %xmm2
; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
; SSE42-NEXT:    psubw %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = zext <8 x i16> %a to <8 x i64>
  %bext = zext <8 x i16> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false)
  %trunc = trunc <8 x i64> %abs to <8 x i16>
  ret <8 x i16> %trunc
}

define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: abd_ext_v8i16_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psubusw %xmm0, %xmm2
; SSE2-NEXT:    psubusw %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v8i16_undef:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminuw %xmm1, %xmm2
; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
; SSE42-NEXT:    psubw %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v8i16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = zext <8 x i16> %a to <8 x i64>
  %bext = zext <8 x i16> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true)
  %trunc = trunc <8 x i64> %abs to <8 x i16>
  ret <8 x i16> %trunc
}

define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminud %xmm1, %xmm2
; SSE42-NEXT:    pmaxud %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = zext <4 x i32> %a to <4 x i64>
  %bext = zext <4 x i32> %b to <4 x i64>
  %sub = sub <4 x i64> %aext, %bext
  %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false)
  %trunc = trunc <4 x i64> %abs to <4 x i32>
  ret <4 x i32> %trunc
}

define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminud %xmm1, %xmm2
; SSE42-NEXT:    pmaxud %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v4i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = zext <4 x i32> %a to <4 x i64>
  %bext = zext <4 x i32> %b to <4 x i64>
  %sub = sub <4 x i64> %aext, %bext
  %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true)
  %trunc = trunc <4 x i64> %abs to <4 x i32>
  ret <4 x i32> %trunc
}

define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pxor %xmm2, %xmm3
; SSE42-NEXT:    pxor %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm3, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %aext = zext <2 x i64> %a to <2 x i128>
  %bext = zext <2 x i64> %b to <2 x i128>
  %sub = sub <2 x i128> %aext, %bext
  %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 false)
  %trunc = trunc <2 x i128> %abs to <2 x i64>
  ret <2 x i64> %trunc
}

define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pxor %xmm2, %xmm3
; SSE42-NEXT:    pxor %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm3, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %aext = zext <2 x i64> %a to <2 x i128>
  %bext = zext <2 x i64> %b to <2 x i128>
  %sub = sub <2 x i128> %aext, %bext
  %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true)
  %trunc = trunc <2 x i128> %abs to <2 x i64>
  ret <2 x i64> %trunc
}

;
; sub(umax(a,b),umin(a,b)) -> abdu(a,b)
;

define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: abd_minmax_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminub %xmm1, %xmm2
; SSE-NEXT:    pmaxub %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_minmax_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %min = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b)
  %max = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b)
  %sub = sub <16 x i8> %max, %min
  ret <16 x i8> %sub
}

define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: abd_minmax_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psubusw %xmm0, %xmm2
; SSE2-NEXT:    psubusw %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_minmax_v8i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminuw %xmm1, %xmm2
; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
; SSE42-NEXT:    psubw %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_minmax_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %min = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> %b)
  %max = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %a, <8 x i16> %b)
  %sub = sub <8 x i16> %max, %min
  ret <8 x i16> %sub
}

define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_minmax_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_minmax_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminud %xmm1, %xmm2
; SSE42-NEXT:    pmaxud %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_minmax_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %min = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b)
  %max = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b)
  %sub = sub <4 x i32> %max, %min
  ret <4 x i32> %sub
}

define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_minmax_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pxor %xmm2, %xmm3
; SSE42-NEXT:    pxor %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm3, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %min = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b)
  %max = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
  %sub = sub <2 x i64> %max, %min
  ret <2 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abdu(a,b)
;

define <16 x i8> @abd_cmp_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: abd_cmp_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminub %xmm1, %xmm2
; SSE-NEXT:    pmaxub %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp ugt <16 x i8> %a, %b
  %ab = sub <16 x i8> %a, %b
  %ba = sub <16 x i8> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i8> %ab, <16 x i8> %ba
  ret <16 x i8> %sel
}

define <8 x i16> @abd_cmp_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: abd_cmp_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psubusw %xmm0, %xmm2
; SSE2-NEXT:    psubusw %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v8i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminuw %xmm1, %xmm2
; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
; SSE42-NEXT:    psubw %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp uge <8 x i16> %a, %b
  %ab = sub <8 x i16> %a, %b
  %ba = sub <8 x i16> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i16> %ab, <8 x i16> %ba
  ret <8 x i16> %sel
}

define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_cmp_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminud %xmm1, %xmm2
; SSE42-NEXT:    pmaxud %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp ult <4 x i32> %a, %b
  %ab = sub <4 x i32> %a, %b
  %ba = sub <4 x i32> %b, %a
  %sel = select <4 x i1> %cmp, <4 x i32> %ba, <4 x i32> %ab
  ret <4 x i32> %sel
}

define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pxor %xmm2, %xmm3
; SSE42-NEXT:    pxor %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm3, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_cmp_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %cmp = icmp uge <2 x i64> %a, %b
  %ab = sub <2 x i64> %a, %b
  %ba = sub <2 x i64> %b, %a
  %sel = select <2 x i1> %cmp, <2 x i64> %ab, <2 x i64> %ba
  ret <2 x i64> %sel
}

;
; Special cases
;

define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    pxor %xmm3, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psubq %xmm2, %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm2
; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT:    pxor %xmm3, %xmm1
; SSE42-NEXT:    pxor %xmm3, %xmm0
; SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm0, %xmm1
; SSE42-NEXT:    psubq %xmm2, %xmm1
; SSE42-NEXT:    paddq %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm3 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm1
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k1
; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
  %cmp = icmp ugt <2 x i64> %a, %b
  %ab = sub <2 x i64> %a, %b
  %ba = sub <2 x i64> %b, %a
  %sel = select <2 x i1> %cmp, <2 x i64> %ab, <2 x i64> %ba
  %ext = sext <2 x i1> %cmp to <2 x i64>
  %res = add <2 x i64> %ext, %sel
  ret <2 x i64> %res
}

define <8 x i16> @abd_cmp_v8i16_multiuse_sub(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: abd_cmp_v8i16_multiuse_sub:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubw %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psubusw %xmm0, %xmm3
; SSE2-NEXT:    psubusw %xmm1, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v8i16_multiuse_sub:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    psubw %xmm1, %xmm2
; SSE42-NEXT:    movdqa %xmm0, %xmm3
; SSE42-NEXT:    pminuw %xmm1, %xmm3
; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
; SSE42-NEXT:    psubw %xmm3, %xmm0
; SSE42-NEXT:    paddw %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v8i16_multiuse_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp uge <8 x i16> %a, %b
  %ab = sub <8 x i16> %a, %b
  %ba = sub <8 x i16> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i16> %ab, <8 x i16> %ba
  %res = add <8 x i16> %ab, %sel
  ret <8 x i16> %res
}

declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1)

declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>)

declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)