; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512

declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)

; <8 x i16> pattern: select (x <s 0), (x ^ splat(-32768)), 0.
; The assertions expect one (v)psubusw with a constant-pool operand on every
; run line (all of them are covered by the SSE and AVX prefixes).
define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
vector.ph:
  %0 = icmp slt <8 x i16> %x, zeroinitializer
  %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; This is logically
; equivalent to the above.
; usubsat X, (1 << (BW-1)) <--> (X ^ (1 << (BW-1))) & (ashr X, (BW-1))

; <8 x i16> instance of the identity: (ashr x, 15) & (x ^ 32768).
; Lane 0 of the xor constant is undef; the remaining lanes are 32768.
; The assertions expect a single (v)psubusw with a constant-pool operand.
define <8 x i16> @ashr_xor_and(<8 x i16> %x) nounwind {
; SSE-LABEL: ashr_xor_and:
; SSE: # %bb.0:
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ashr_xor_and:
; AVX: # %bb.0:
; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %flipsign = xor <8 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
  %res = and <8 x i16> %signsplat, %flipsign
  ret <8 x i16> %res
}

; Same shape as @ashr_xor_and, but the 32768 constant is applied with 'add'
; instead of 'xor' (lane 0 of the constant is again undef).
; The assertions expect the same single (v)psubusw.
define <8 x i16> @ashr_add_and(<8 x i16> %x) nounwind {
; SSE-LABEL: ashr_add_and:
; SSE: # %bb.0:
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ashr_add_and:
; AVX: # %bb.0:
; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %flipsign = add <8 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
  %res = and <8 x i16> %signsplat, %flipsign
  ret <8 x i16> %res
}

; negative test - extra uses may lead to extra instructions when custom-lowered

; <16 x i8> variant in which both %signsplat and %flipsign are also stored
; (to %p1 and %p2), and the 'and' takes %flipsign as its first operand.
; The assertions expect the compare / xor / and sequence with both stores,
; and no psubusb.
define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, ptr %p1, ptr %p2) nounwind {
; SSE-LABEL: ashr_xor_and_commute_uses:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pcmpgtb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, (%rsi)
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_commute_uses:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_commute_uses:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_commute_uses:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %signsplat = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  store <16 x i8> %signsplat, ptr %p1
  %flipsign = xor <16 x i8> %x, <i8 undef, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
  store <16 x i8> %flipsign, ptr %p2
  %res = and <16 x i8> %flipsign, %signsplat
  ret <16 x i8> %res
}

; <4 x i32> variant: (x ^ 2147483648) & (ashr x, 31), with lane 0 of the shift
; amount undef. Expected code differs per feature level: psrad + pxor + pand for
; the SSE2/SSSE3 runs, (v)pmaxud + (v)psubd for the SSE4.1, AVX1 and AVX2 runs,
; and vpsrad + vpternlogd for the AVX512 runs.
define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
; SSE2OR3-LABEL: ashr_xor_and_custom:
; SSE2OR3: # %bb.0:
; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
; SSE2OR3-NEXT: psrad $31, %xmm1
; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2OR3-NEXT: pand %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: ashr_xor_and_custom:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_custom:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_custom:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_custom:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
; AVX512-NEXT: retq
  %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
  %flipsign = xor <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
  %res = and <4 x i32> %flipsign, %signsplat
  ret <4 x i32> %res
}

; Same as @ashr_xor_and_custom, but the 2147483648 constant is applied with
; 'add' instead of 'xor'. The expected code is identical to that function's
; (the SSE2/SSSE3 assertions still show pxor).
define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind {
; SSE2OR3-LABEL: ashr_add_and_custom:
; SSE2OR3: # %bb.0:
; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
; SSE2OR3-NEXT: psrad $31, %xmm1
; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2OR3-NEXT: pand %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: ashr_add_and_custom:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ashr_add_and_custom:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_add_and_custom:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_add_and_custom:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
; AVX512-NEXT: retq
  %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
  %flipsign = add <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
  %res = and <4 x i32> %flipsign, %signsplat
  ret <4 x i32> %res
}

; usubsat X, (1 << (BW-1)) <--> (X ^ (1 << (BW-1))) & (ashr X, (BW-1))

; Starts from the intrinsic side of the identity: a direct call to
; @llvm.usub.sat.v4i32 with 2147483648 in lanes 0-2 and undef in lane 3.
; The SSE2/SSSE3 and AVX512 runs are expected to produce the shift/xor/and
; form; the SSE4.1, AVX1 and AVX2 runs the (v)pmaxud + (v)psubd form.
define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind {
; SSE2OR3-LABEL: usubsat_custom:
; SSE2OR3: # %bb.0:
; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
; SSE2OR3-NEXT: psrad $31, %xmm1
; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2OR3-NEXT: pand %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: usubsat_custom:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,u]
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubsat_custom:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubsat_custom:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubsat_custom:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 undef>)
  ret <4 x i32> %res
}

; <8 x i16> pattern: select (x >u splat(32766)), (x + splat(-32767)), 0.
; The assertions expect a single (v)psubusw with a constant-pool operand.
define <8 x i16> @test2(<8 x i16> %x) nounwind {
; SSE-LABEL: test2:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
vector.ph:
  %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; <8 x i16> pattern with a variable subtrahend: %w is splatted to all lanes,
; then select (x <u splat), 0, (x - splat).
; The assertions expect the splat of %edi followed by one (v)psubusw.
define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test3:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %xmm1
; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = icmp ult <8 x i16> %x, %broadcast15
  %2 = sub <8 x i16> %x, %broadcast15
  %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
  ret <8 x i16> %res
}

; <16 x i8> pattern: select (x <s 0), (x ^ splat(-128)), 0.
; The assertions expect a single (v)psubusb with a constant-pool operand.
define <16 x i8> @test4(<16 x i8> %x) nounwind {
; SSE-LABEL: test4:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
vector.ph:
  %0 = icmp slt <16 x i8> %x, zeroinitializer
  %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

; <16 x i8> pattern: select (x >u splat(126)), (x + splat(-127)), 0.
; The assertions expect a single (v)psubusb with a constant-pool operand.
define <16 x i8> @test5(<16 x i8> %x) nounwind {
; SSE-LABEL: test5:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
vector.ph:
  %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

; <16 x i8> pattern with a variable subtrahend: %w is splatted to all lanes,
; then select (x <u splat), 0, (x - splat).
; The assertions expect a byte splat of %edi (its instruction sequence varies
; by feature level) followed by one (v)psubusb.
define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: movd %edi, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: psubusb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test6:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: psubusb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test6:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test6:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %xmm1
; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = icmp ult <16 x i8> %x, %broadcast15
  %2 = sub <16 x i8> %x, %broadcast15
  %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
  ret <16 x i8> %res
}

; <16 x i16> (256-bit) pattern: select (x <s 0), (x ^ splat(-32768)), 0.
; The SSE and AVX1 runs are expected to use two 128-bit (v)psubusw; the AVX2
; and AVX512 runs a single ymm vpsubusw.
define <16 x i16> @test7(<16 x i16> %x) nounwind {
; SSE-LABEL: test7:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test7:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = icmp slt <16 x i16> %x, zeroinitializer
  %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; <16 x i16> pattern: (ashr x, 15) & (x ^ 32768), with lane 0 of the xor
; constant undef. Expected code matches the splat-32768 (v)psubusw form:
; two 128-bit halves for the SSE and AVX1 runs, one ymm op for AVX2 and AVX512.
define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind {
; SSE-LABEL: ashr_xor_and_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
  %signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %flipsign = xor <16 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
  %res = and <16 x i16> %signsplat, %flipsign
  ret <16 x i16> %res
}

; Same as @ashr_xor_and_v16i16, but the 32768 constant is applied with 'add'
; instead of 'xor'. The expected code is identical to that function's.
define <16 x i16> @ashr_add_and_v16i16(<16 x i16> %x) nounwind {
; SSE-LABEL: ashr_add_and_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_add_and_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_add_and_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_add_and_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
  %signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %flipsign = add <16 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
  %res = and <16 x i16> %signsplat, %flipsign
  ret <16 x i16> %res
}

; <16 x i16> pattern: select (x >u splat(32766)), (x + splat(-32767)), 0.
; The expected subtrahend is a splat of 32767 (visible in the SSE and AVX1
; assertions); AVX2 and AVX512 use a single ymm vpsubusw from the constant pool.
define <16 x i16> @test8(<16 x i16> %x) nounwind {
; SSE-LABEL: test8:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test8:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Non-splat variant of @test8: the compare constants run 32766 down to 32751
; and the add constants -32767 up to -32752, one distinct pair per lane.
; The assertions still expect (v)psubusw, with the constants loaded from the
; constant pool (two separate loads for the SSE and AVX1 runs).
define <16 x i16> @test8a(<16 x i16> %x) nounwind {
; SSE-LABEL: test8a:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test8a:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8a:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test8a:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
  %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
  %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; <16 x i16> pattern with a variable subtrahend: %w is splatted to all lanes,
; then select (x <u splat), 0, (x - splat).
; The assertions expect the splat of %edi followed by (v)psubusw (two 128-bit
; ops for the SSE and AVX1 runs, one ymm op for AVX2 and AVX512).
define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %ymm1
; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = icmp ult <16 x i16> %x, %broadcast15
  %2 = sub <16 x i16> %x, %broadcast15
  %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
  ret <16 x i16> %res
}

; <32 x i8> (256-bit) pattern: select (x <s 0), (x ^ splat(-128)), 0.
; The SSE and AVX1 runs are expected to use two 128-bit (v)psubusb with a
; splat of 128; the AVX2 and AVX512 runs a single ymm vpsubusb.
define <32 x i8> @test10(<32 x i8> %x) nounwind {
; SSE-LABEL: test10:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test10:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = icmp slt <32 x i8> %x, zeroinitializer
  %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

; <32 x i8> pattern: select (x >u splat(126)), (x + splat(-127)), 0.
; The expected subtrahend is a splat of 127 (visible in the SSE and AVX1
; assertions); AVX2 and AVX512 use a single ymm vpsubusb from the constant pool.
define <32 x i8> @test11(<32 x i8> %x) nounwind {
; SSE-LABEL: test11:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test11:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

; Non-splat variant of @test11: the compare constants run 126 down to 95 and
; the add constants -127 up to -96, one distinct pair per lane.
; The assertions still expect (v)psubusb, with the constants loaded from the
; constant pool (two separate loads for the SSE and AVX1 runs).
define <32 x i8> @test11a(<32 x i8> %x) nounwind {
; SSE-LABEL: test11a:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test11a:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11a:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test11a:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
  %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
  %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

; <32 x i8> pattern with a variable subtrahend: %w is splatted to all lanes,
; then select (x <u splat), 0, (x - splat).
; The assertions expect a byte splat of %edi followed by (v)psubusb (two
; 128-bit ops below AVX2, one ymm op for AVX2 and AVX512).
define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: psubusb %xmm2, %xmm0
; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: movd %edi, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm2
; SSSE3-NEXT: psubusb %xmm2, %xmm0
; SSSE3-NEXT: psubusb %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test12:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pshufb %xmm3, %xmm2
; SSE41-NEXT: psubusb %xmm2, %xmm0
; SSE41-NEXT: psubusb %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test12:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test12:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %ymm1
; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = icmp ult <32 x i8> %x, %broadcast15
  %2 = sub <32 x i8> %x, %broadcast15
  %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
  ret <32 x i8> %res
}

; Mixed-width pattern: %x (<8 x i16>) is zero-extended to i32 and compared with
; and subtracted from %y (<8 x i32>); the difference is truncated back to i16
; and lanes where zext(x) <u y are replaced by 0.
; Every run is expected to end in (v)psubusw after narrowing %y to i16: via
; pminud with 65535 + packusdw on the SSE4.1, AVX1 and AVX2 runs, vpmovusdw on
; the AVX512 runs, and a longer compare/blend/pack sequence on SSE2 and SSSE3.
define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test13:
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: por %xmm2, %xmm6
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: packssdw %xmm6, %xmm5
; SSE2-NEXT: psubusw %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test13:
; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm5, %xmm4
; SSSE3-NEXT: pand %xmm1, %xmm5
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; SSSE3-NEXT: psubusw %xmm5, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test13:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test13:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test13:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test13:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
  %lhs = zext <8 x i16> %x to <8 x i32>
  %cond = icmp ult <8 x i32> %lhs, %y
  %sub = sub <8 x i32> %lhs, %y
  %trunc = trunc <8 x i32> %sub to <8 x i16>
  %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
  ret <8 x i16> %res
}

; FIXME: match this to UMIN+TRUNC+PSUBUS
define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2OR3-LABEL: test14:
; SSE2OR3: # %bb.0: # %vector.ph
; SSE2OR3-NEXT: pxor %xmm6, %xmm6
; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
; SSE2OR3-NEXT: movdqa %xmm2, %xmm9
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2OR3-NEXT: pand %xmm10, %xmm4
; SSE2OR3-NEXT: pand %xmm10, %xmm3
; SSE2OR3-NEXT: packuswb %xmm4, %xmm3
; SSE2OR3-NEXT: movdqa %xmm1, %xmm4
; SSE2OR3-NEXT: pand %xmm10, %xmm2
; SSE2OR3-NEXT: pand %xmm10, %xmm1
; SSE2OR3-NEXT: packuswb %xmm2, %xmm1
; SSE2OR3-NEXT: packuswb %xmm3, %xmm1
; SSE2OR3-NEXT: psubb %xmm0, %xmm1
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
; SSE2OR3-NEXT: movdqa %xmm5, %xmm3
; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2OR3-NEXT: pxor %xmm6, %xmm7
; SSE2OR3-NEXT: por %xmm6, %xmm5
; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2OR3-NEXT: pxor %xmm6, %xmm8
; SSE2OR3-NEXT: por %xmm6, %xmm3
; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3
; SSE2OR3-NEXT: packssdw %xmm5, %xmm3
; SSE2OR3-NEXT: pxor %xmm6, %xmm9
; SSE2OR3-NEXT: por %xmm6, %xmm2
; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2
; SSE2OR3-NEXT: pxor %xmm6, %xmm4
; SSE2OR3-NEXT: por %xmm6, %xmm0
; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0
; SSE2OR3-NEXT: packssdw %xmm2, %xmm0
; SSE2OR3-NEXT: packsswb %xmm3, %xmm0
; SSE2OR3-NEXT: pandn %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: test14:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
; SSE41-NEXT: pmaxud %xmm4, %xmm8
; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
; SSE41-NEXT: pmaxud %xmm3, %xmm7
898; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 899; SSE41-NEXT: packssdw %xmm8, %xmm7 900; SSE41-NEXT: pmaxud %xmm1, %xmm6 901; SSE41-NEXT: pcmpeqd %xmm1, %xmm6 902; SSE41-NEXT: pmaxud %xmm2, %xmm5 903; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 904; SSE41-NEXT: packssdw %xmm5, %xmm6 905; SSE41-NEXT: packsswb %xmm7, %xmm6 906; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255] 907; SSE41-NEXT: pand %xmm5, %xmm4 908; SSE41-NEXT: pand %xmm5, %xmm3 909; SSE41-NEXT: packusdw %xmm4, %xmm3 910; SSE41-NEXT: pand %xmm5, %xmm2 911; SSE41-NEXT: pand %xmm1, %xmm5 912; SSE41-NEXT: packusdw %xmm2, %xmm5 913; SSE41-NEXT: packuswb %xmm3, %xmm5 914; SSE41-NEXT: psubb %xmm0, %xmm5 915; SSE41-NEXT: pand %xmm6, %xmm5 916; SSE41-NEXT: movdqa %xmm5, %xmm0 917; SSE41-NEXT: retq 918; 919; AVX1-LABEL: test14: 920; AVX1: # %bb.0: # %vector.ph 921; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 922; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 923; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 924; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] 925; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 926; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] 927; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero 928; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 929; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6 930; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 931; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm5 932; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm5 933; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 934; AVX1-NEXT: vpmaxud %xmm4, %xmm1, %xmm4 935; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm4 936; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 937; AVX1-NEXT: vpmaxud %xmm3, %xmm6, %xmm3 938; AVX1-NEXT: vpcmpeqd %xmm3, 
%xmm6, %xmm3 939; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 940; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3 941; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] 942; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 943; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 944; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 945; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 946; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 947; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 948; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 949; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 950; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 951; AVX1-NEXT: vzeroupper 952; AVX1-NEXT: retq 953; 954; AVX2-LABEL: test14: 955; AVX2: # %bb.0: # %vector.ph 956; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 957; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero 958; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 959; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4 960; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4 961; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3 962; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3 963; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 964; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 965; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 966; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 967; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3 968; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] 969; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 970; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 971; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 972; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 973; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 974; AVX2-NEXT: vpackuswb %xmm2, 
%xmm1, %xmm1 975; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 976; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 977; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 978; AVX2-NEXT: vzeroupper 979; AVX2-NEXT: retq 980; 981; AVX512-LABEL: test14: 982; AVX512: # %bb.0: # %vector.ph 983; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 984; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1 985; AVX512-NEXT: vpmovdb %zmm1, %xmm1 986; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z} 987; AVX512-NEXT: vzeroupper 988; AVX512-NEXT: retq 989vector.ph: 990 %rhs = zext <16 x i8> %x to <16 x i32> 991 %cond = icmp ult <16 x i32> %y, %rhs 992 %sub = sub <16 x i32> %y, %rhs 993 %truncsub = trunc <16 x i32> %sub to <16 x i8> 994 %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub 995 ret <16 x i8> %res 996} 997 998define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { 999; SSE2-LABEL: test15: 1000; SSE2: # %bb.0: # %vector.ph 1001; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1002; SSE2-NEXT: movdqa %xmm2, %xmm4 1003; SSE2-NEXT: pxor %xmm3, %xmm4 1004; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 1005; SSE2-NEXT: movdqa %xmm5, %xmm6 1006; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 1007; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1008; SSE2-NEXT: pand %xmm6, %xmm2 1009; SSE2-NEXT: pxor %xmm4, %xmm6 1010; SSE2-NEXT: por %xmm2, %xmm6 1011; SSE2-NEXT: pslld $16, %xmm6 1012; SSE2-NEXT: psrad $16, %xmm6 1013; SSE2-NEXT: pxor %xmm1, %xmm3 1014; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 1015; SSE2-NEXT: pxor %xmm5, %xmm4 1016; SSE2-NEXT: pand %xmm1, %xmm5 1017; SSE2-NEXT: por 
%xmm4, %xmm5 1018; SSE2-NEXT: pslld $16, %xmm5 1019; SSE2-NEXT: psrad $16, %xmm5 1020; SSE2-NEXT: packssdw %xmm6, %xmm5 1021; SSE2-NEXT: psubusw %xmm5, %xmm0 1022; SSE2-NEXT: retq 1023; 1024; SSSE3-LABEL: test15: 1025; SSSE3: # %bb.0: # %vector.ph 1026; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1027; SSSE3-NEXT: movdqa %xmm2, %xmm4 1028; SSSE3-NEXT: pxor %xmm3, %xmm4 1029; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 1030; SSSE3-NEXT: movdqa %xmm5, %xmm6 1031; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 1032; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 1033; SSSE3-NEXT: pand %xmm6, %xmm2 1034; SSSE3-NEXT: pxor %xmm4, %xmm6 1035; SSSE3-NEXT: por %xmm2, %xmm6 1036; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1037; SSSE3-NEXT: pshufb %xmm2, %xmm6 1038; SSSE3-NEXT: pxor %xmm1, %xmm3 1039; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 1040; SSSE3-NEXT: pxor %xmm5, %xmm4 1041; SSSE3-NEXT: pand %xmm1, %xmm5 1042; SSSE3-NEXT: por %xmm4, %xmm5 1043; SSSE3-NEXT: pshufb %xmm2, %xmm5 1044; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 1045; SSSE3-NEXT: psubusw %xmm5, %xmm0 1046; SSSE3-NEXT: retq 1047; 1048; SSE41-LABEL: test15: 1049; SSE41: # %bb.0: # %vector.ph 1050; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 1051; SSE41-NEXT: pminud %xmm3, %xmm2 1052; SSE41-NEXT: pminud %xmm3, %xmm1 1053; SSE41-NEXT: packusdw %xmm2, %xmm1 1054; SSE41-NEXT: psubusw %xmm1, %xmm0 1055; SSE41-NEXT: retq 1056; 1057; AVX1-LABEL: test15: 1058; AVX1: # %bb.0: # %vector.ph 1059; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1060; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] 1061; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 1062; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 1063; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1064; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1065; AVX1-NEXT: vzeroupper 1066; AVX1-NEXT: retq 1067; 1068; AVX2-LABEL: test15: 1069; AVX2: # %bb.0: # %vector.ph 1070; 
AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 1071; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 1072; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1073; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1074; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1075; AVX2-NEXT: vzeroupper 1076; AVX2-NEXT: retq 1077; 1078; AVX512-LABEL: test15: 1079; AVX512: # %bb.0: # %vector.ph 1080; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 1081; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1082; AVX512-NEXT: vzeroupper 1083; AVX512-NEXT: retq 1084vector.ph: 1085 %lhs = zext <8 x i16> %x to <8 x i32> 1086 %cond = icmp ugt <8 x i32> %lhs, %y 1087 %sub = sub <8 x i32> %lhs, %y 1088 %truncsub = trunc <8 x i32> %sub to <8 x i16> 1089 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer 1090 ret <8 x i16> %res 1091} 1092 1093; FIXME: match this to UMIN+TRUNC+PSUBUS 1094define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind { 1095; SSE2-LABEL: test16: 1096; SSE2: # %bb.0: # %vector.ph 1097; SSE2-NEXT: pxor %xmm3, %xmm3 1098; SSE2-NEXT: movdqa %xmm0, %xmm4 1099; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1100; SSE2-NEXT: movdqa %xmm0, %xmm5 1101; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1102; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1103; SSE2-NEXT: movdqa %xmm2, %xmm6 1104; SSE2-NEXT: pxor %xmm3, %xmm6 1105; SSE2-NEXT: por %xmm3, %xmm5 1106; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 1107; SSE2-NEXT: movdqa %xmm1, %xmm6 1108; SSE2-NEXT: pxor %xmm3, %xmm6 1109; SSE2-NEXT: por %xmm3, %xmm4 1110; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 1111; SSE2-NEXT: packssdw %xmm5, %xmm4 1112; SSE2-NEXT: pslld $16, %xmm2 1113; SSE2-NEXT: psrad $16, %xmm2 1114; SSE2-NEXT: pslld $16, %xmm1 1115; SSE2-NEXT: psrad $16, %xmm1 1116; SSE2-NEXT: packssdw %xmm2, %xmm1 1117; SSE2-NEXT: psubw %xmm1, %xmm0 1118; SSE2-NEXT: pand %xmm4, %xmm0 
1119; SSE2-NEXT: retq 1120; 1121; SSSE3-LABEL: test16: 1122; SSSE3: # %bb.0: # %vector.ph 1123; SSSE3-NEXT: pxor %xmm3, %xmm3 1124; SSSE3-NEXT: movdqa %xmm0, %xmm4 1125; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1126; SSSE3-NEXT: movdqa %xmm0, %xmm5 1127; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1128; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1129; SSSE3-NEXT: movdqa %xmm2, %xmm6 1130; SSSE3-NEXT: pxor %xmm3, %xmm6 1131; SSSE3-NEXT: por %xmm3, %xmm5 1132; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 1133; SSSE3-NEXT: movdqa %xmm1, %xmm6 1134; SSSE3-NEXT: pxor %xmm3, %xmm6 1135; SSSE3-NEXT: por %xmm3, %xmm4 1136; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 1137; SSSE3-NEXT: packssdw %xmm5, %xmm4 1138; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1139; SSSE3-NEXT: pshufb %xmm3, %xmm2 1140; SSSE3-NEXT: pshufb %xmm3, %xmm1 1141; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1142; SSSE3-NEXT: psubw %xmm1, %xmm0 1143; SSSE3-NEXT: pand %xmm4, %xmm0 1144; SSSE3-NEXT: retq 1145; 1146; SSE41-LABEL: test16: 1147; SSE41: # %bb.0: # %vector.ph 1148; SSE41-NEXT: pxor %xmm4, %xmm4 1149; SSE41-NEXT: movdqa %xmm0, %xmm5 1150; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 1151; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1152; SSE41-NEXT: pmaxud %xmm2, %xmm5 1153; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 1154; SSE41-NEXT: pmaxud %xmm1, %xmm3 1155; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 1156; SSE41-NEXT: packssdw %xmm5, %xmm3 1157; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] 1158; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] 1159; SSE41-NEXT: packusdw %xmm2, %xmm1 1160; SSE41-NEXT: psubw %xmm1, %xmm0 1161; 
SSE41-NEXT: pandn %xmm0, %xmm3 1162; SSE41-NEXT: movdqa %xmm3, %xmm0 1163; SSE41-NEXT: retq 1164; 1165; AVX1-LABEL: test16: 1166; AVX1: # %bb.0: # %vector.ph 1167; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1168; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1169; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1170; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1171; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm2 1172; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2 1173; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3 1174; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 1175; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 1176; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1177; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1178; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1179; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1180; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0 1181; AVX1-NEXT: vzeroupper 1182; AVX1-NEXT: retq 1183; 1184; AVX2-LABEL: test16: 1185; AVX2: # %bb.0: # %vector.ph 1186; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1187; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2 1188; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 1189; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 1190; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 1191; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1192; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 1193; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1194; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1195; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1196; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 1197; AVX2-NEXT: vzeroupper 1198; AVX2-NEXT: retq 1199; 1200; AVX512-LABEL: test16: 1201; AVX512: # %bb.0: # %vector.ph 1202; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 
1203; AVX512-NEXT: vpcmpltud %ymm2, %ymm1, %k1 1204; AVX512-NEXT: vpmovdw %ymm1, %xmm1 1205; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} 1206; AVX512-NEXT: vzeroupper 1207; AVX512-NEXT: retq 1208vector.ph: 1209 %lhs = zext <8 x i16> %x to <8 x i32> 1210 %cond = icmp ult <8 x i32> %y, %lhs 1211 %sub = sub <8 x i32> %lhs, %y 1212 %truncsub = trunc <8 x i32> %sub to <8 x i16> 1213 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer 1214 ret <8 x i16> %res 1215} 1216 1217define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind { 1218; SSE2-LABEL: test17: 1219; SSE2: # %bb.0: # %vector.ph 1220; SSE2-NEXT: movd %edi, %xmm4 1221; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1222; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 1223; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1224; SSE2-NEXT: psubusb %xmm4, %xmm0 1225; SSE2-NEXT: psubusb %xmm4, %xmm1 1226; SSE2-NEXT: psubusb %xmm4, %xmm2 1227; SSE2-NEXT: psubusb %xmm4, %xmm3 1228; SSE2-NEXT: retq 1229; 1230; SSSE3-LABEL: test17: 1231; SSSE3: # %bb.0: # %vector.ph 1232; SSSE3-NEXT: movd %edi, %xmm4 1233; SSSE3-NEXT: pxor %xmm5, %xmm5 1234; SSSE3-NEXT: pshufb %xmm5, %xmm4 1235; SSSE3-NEXT: psubusb %xmm4, %xmm0 1236; SSSE3-NEXT: psubusb %xmm4, %xmm1 1237; SSSE3-NEXT: psubusb %xmm4, %xmm2 1238; SSSE3-NEXT: psubusb %xmm4, %xmm3 1239; SSSE3-NEXT: retq 1240; 1241; SSE41-LABEL: test17: 1242; SSE41: # %bb.0: # %vector.ph 1243; SSE41-NEXT: movd %edi, %xmm4 1244; SSE41-NEXT: pxor %xmm5, %xmm5 1245; SSE41-NEXT: pshufb %xmm5, %xmm4 1246; SSE41-NEXT: psubusb %xmm4, %xmm0 1247; SSE41-NEXT: psubusb %xmm4, %xmm1 1248; SSE41-NEXT: psubusb %xmm4, %xmm2 1249; SSE41-NEXT: psubusb %xmm4, %xmm3 1250; SSE41-NEXT: retq 1251; 1252; AVX1-LABEL: test17: 1253; AVX1: # %bb.0: # %vector.ph 1254; AVX1-NEXT: vmovd %edi, %xmm2 1255; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1256; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1257; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1258; AVX1-NEXT: 
vpsubusb %xmm2, %xmm3, %xmm3 1259; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 1260; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1261; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1262; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3 1263; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 1264; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1265; AVX1-NEXT: retq 1266; 1267; AVX2-LABEL: test17: 1268; AVX2: # %bb.0: # %vector.ph 1269; AVX2-NEXT: vmovd %edi, %xmm2 1270; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 1271; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 1272; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1 1273; AVX2-NEXT: retq 1274; 1275; AVX512-LABEL: test17: 1276; AVX512: # %bb.0: # %vector.ph 1277; AVX512-NEXT: vpbroadcastb %edi, %zmm1 1278; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0 1279; AVX512-NEXT: retq 1280vector.ph: 1281 %0 = insertelement <64 x i8> undef, i8 %w, i32 0 1282 %broadcast15 = shufflevector <64 x i8> %0, <64 x i8> undef, <64 x i32> zeroinitializer 1283 %1 = icmp ult <64 x i8> %x, %broadcast15 1284 %2 = sub <64 x i8> %x, %broadcast15 1285 %res = select <64 x i1> %1, <64 x i8> zeroinitializer, <64 x i8> %2 1286 ret <64 x i8> %res 1287} 1288 1289define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind { 1290; SSE-LABEL: test18: 1291; SSE: # %bb.0: # %vector.ph 1292; SSE-NEXT: movd %edi, %xmm4 1293; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 1294; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1295; SSE-NEXT: psubusw %xmm4, %xmm0 1296; SSE-NEXT: psubusw %xmm4, %xmm1 1297; SSE-NEXT: psubusw %xmm4, %xmm2 1298; SSE-NEXT: psubusw %xmm4, %xmm3 1299; SSE-NEXT: retq 1300; 1301; AVX1-LABEL: test18: 1302; AVX1: # %bb.0: # %vector.ph 1303; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1304; AVX1-NEXT: vmovd %edi, %xmm3 1305; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] 1306; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 1307; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2 1308; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0 1309; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, 
%ymm0 1310; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1311; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2 1312; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1 1313; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1314; AVX1-NEXT: retq 1315; 1316; AVX2-LABEL: test18: 1317; AVX2: # %bb.0: # %vector.ph 1318; AVX2-NEXT: vmovd %edi, %xmm2 1319; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 1320; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 1321; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 1322; AVX2-NEXT: retq 1323; 1324; AVX512-LABEL: test18: 1325; AVX512: # %bb.0: # %vector.ph 1326; AVX512-NEXT: vpbroadcastw %edi, %zmm1 1327; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 1328; AVX512-NEXT: retq 1329vector.ph: 1330 %0 = insertelement <32 x i16> undef, i16 %w, i32 0 1331 %broadcast15 = shufflevector <32 x i16> %0, <32 x i16> undef, <32 x i32> zeroinitializer 1332 %1 = icmp ult <32 x i16> %x, %broadcast15 1333 %2 = sub <32 x i16> %x, %broadcast15 1334 %res = select <32 x i1> %1, <32 x i16> zeroinitializer, <32 x i16> %2 1335 ret <32 x i16> %res 1336} 1337 1338define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind { 1339; SSE-LABEL: psubus_8i16_max: 1340; SSE: # %bb.0: # %vector.ph 1341; SSE-NEXT: psubusw %xmm1, %xmm0 1342; SSE-NEXT: retq 1343; 1344; AVX-LABEL: psubus_8i16_max: 1345; AVX: # %bb.0: # %vector.ph 1346; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1347; AVX-NEXT: retq 1348vector.ph: 1349 %cmp = icmp ult <8 x i16> %x, %y 1350 %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x 1351 %res = sub <8 x i16> %max, %y 1352 ret <8 x i16> %res 1353} 1354 1355define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind { 1356; SSE-LABEL: psubus_16i8_max: 1357; SSE: # %bb.0: # %vector.ph 1358; SSE-NEXT: psubusb %xmm1, %xmm0 1359; SSE-NEXT: retq 1360; 1361; AVX-LABEL: psubus_16i8_max: 1362; AVX: # %bb.0: # %vector.ph 1363; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 1364; AVX-NEXT: retq 1365vector.ph: 1366 %cmp = icmp ult <16 x i8> %x, %y 1367 %max = select <16 x i1> %cmp, <16 x i8> %y, 
<16 x i8> %x 1368 %res = sub <16 x i8> %max, %y 1369 ret <16 x i8> %res 1370} 1371 1372define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind { 1373; SSE-LABEL: psubus_16i16_max: 1374; SSE: # %bb.0: # %vector.ph 1375; SSE-NEXT: psubusw %xmm2, %xmm0 1376; SSE-NEXT: psubusw %xmm3, %xmm1 1377; SSE-NEXT: retq 1378; 1379; AVX1-LABEL: psubus_16i16_max: 1380; AVX1: # %bb.0: # %vector.ph 1381; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1382; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1383; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2 1384; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1385; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1386; AVX1-NEXT: retq 1387; 1388; AVX2-LABEL: psubus_16i16_max: 1389; AVX2: # %bb.0: # %vector.ph 1390; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1391; AVX2-NEXT: retq 1392; 1393; AVX512-LABEL: psubus_16i16_max: 1394; AVX512: # %bb.0: # %vector.ph 1395; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1396; AVX512-NEXT: retq 1397vector.ph: 1398 %cmp = icmp ult <16 x i16> %x, %y 1399 %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x 1400 %res = sub <16 x i16> %max, %y 1401 ret <16 x i16> %res 1402} 1403 1404define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind { 1405; SSE-LABEL: psubus_32i16_max: 1406; SSE: # %bb.0: # %vector.ph 1407; SSE-NEXT: psubusw %xmm4, %xmm0 1408; SSE-NEXT: psubusw %xmm5, %xmm1 1409; SSE-NEXT: psubusw %xmm6, %xmm2 1410; SSE-NEXT: psubusw %xmm7, %xmm3 1411; SSE-NEXT: retq 1412; 1413; AVX1-LABEL: psubus_32i16_max: 1414; AVX1: # %bb.0: # %vector.ph 1415; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1416; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1417; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4 1418; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 1419; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1420; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1421; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1422; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2 1423; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1 1424; AVX1-NEXT: vinsertf128 
$1, %xmm2, %ymm1, %ymm1 1425; AVX1-NEXT: retq 1426; 1427; AVX2-LABEL: psubus_32i16_max: 1428; AVX2: # %bb.0: # %vector.ph 1429; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 1430; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 1431; AVX2-NEXT: retq 1432; 1433; AVX512-LABEL: psubus_32i16_max: 1434; AVX512: # %bb.0: # %vector.ph 1435; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 1436; AVX512-NEXT: retq 1437vector.ph: 1438 %cmp = icmp ult <32 x i16> %x, %y 1439 %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x 1440 %res = sub <32 x i16> %max, %y 1441 ret <32 x i16> %res 1442} 1443 1444define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind { 1445; SSE-LABEL: psubus_64i8_max: 1446; SSE: # %bb.0: # %vector.ph 1447; SSE-NEXT: psubusb %xmm4, %xmm0 1448; SSE-NEXT: psubusb %xmm5, %xmm1 1449; SSE-NEXT: psubusb %xmm6, %xmm2 1450; SSE-NEXT: psubusb %xmm7, %xmm3 1451; SSE-NEXT: retq 1452; 1453; AVX1-LABEL: psubus_64i8_max: 1454; AVX1: # %bb.0: # %vector.ph 1455; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1456; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1457; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4 1458; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 1459; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1460; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1461; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1462; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2 1463; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1 1464; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1465; AVX1-NEXT: retq 1466; 1467; AVX2-LABEL: psubus_64i8_max: 1468; AVX2: # %bb.0: # %vector.ph 1469; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 1470; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1 1471; AVX2-NEXT: retq 1472; 1473; AVX512-LABEL: psubus_64i8_max: 1474; AVX512: # %bb.0: # %vector.ph 1475; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0 1476; AVX512-NEXT: retq 1477vector.ph: 1478 %cmp = icmp ult <64 x i8> %x, %y 1479 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x 1480 %res = sub <64 x i8> %max, %y 1481 ret <64 x i8> %res 1482} 1483 
; The two tests below express unsigned saturating subtraction as
; sub(umax(a, b), b), where umax is spelled as icmp ult + select.
;
; psubus_32i8_max - operates directly on <32 x i8>. The only arithmetic the
; assertions expect is PSUBUSB/VPSUBUSB. The SSE runs use two 128-bit ops, the
; AVX1 run uses two 128-bit ops plus an extract/insert of the upper half, and
; the AVX2 and AVX512 runs use one 256-bit op.
;
; psubus_8i32_max - zero-extends %x to i32, applies the same umax/sub against
; the i32 vector %y, then truncates the difference to i16. The assertions
; expect %y to be narrowed to 16 bits and the result to come from a single
; PSUBUSW/VPSUBUSW. For the SSE4.1, AVX1 and AVX2 runs the narrowing is an
; unsigned min against a 65535 splat followed by PACKUSDW, and for the AVX512
; runs it is VPMOVUSDW. The SSE2 and SSSE3 runs use a longer compare/mask/pack
; sequence that presumably emulates the same clamp (not verified here).
1484define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind { 1485; SSE-LABEL: psubus_32i8_max: 1486; SSE: # %bb.0: # %vector.ph 1487; SSE-NEXT: psubusb %xmm2, %xmm0 1488; SSE-NEXT: psubusb %xmm3, %xmm1 1489; SSE-NEXT: retq 1490; 1491; AVX1-LABEL: psubus_32i8_max: 1492; AVX1: # %bb.0: # %vector.ph 1493; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1494; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1495; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2 1496; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 1497; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1498; AVX1-NEXT: retq 1499; 1500; AVX2-LABEL: psubus_32i8_max: 1501; AVX2: # %bb.0: # %vector.ph 1502; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 1503; AVX2-NEXT: retq 1504; 1505; AVX512-LABEL: psubus_32i8_max: 1506; AVX512: # %bb.0: # %vector.ph 1507; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 1508; AVX512-NEXT: retq 1509vector.ph: 1510 %cmp = icmp ult <32 x i8> %x, %y 1511 %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x 1512 %res = sub <32 x i8> %max, %y 1513 ret <32 x i8> %res 1514} 1515 1516define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { 1517; SSE2-LABEL: psubus_8i32_max: 1518; SSE2: # %bb.0: # %vector.ph 1519; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1520; SSE2-NEXT: movdqa %xmm2, %xmm4 1521; SSE2-NEXT: pxor %xmm3, %xmm4 1522; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 1523; SSE2-NEXT: movdqa %xmm5, %xmm6 1524; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 1525; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1526; SSE2-NEXT: pand %xmm6, %xmm2 1527; SSE2-NEXT: pxor %xmm4, %xmm6 1528; SSE2-NEXT: por %xmm2, %xmm6 1529; SSE2-NEXT: pslld $16, %xmm6 1530; SSE2-NEXT: psrad $16, %xmm6 1531; SSE2-NEXT: pxor %xmm1, %xmm3 1532; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 1533; SSE2-NEXT: pxor %xmm5, %xmm4 1534; SSE2-NEXT: pand %xmm1, %xmm5 1535; SSE2-NEXT: por %xmm4, %xmm5 1536; SSE2-NEXT: pslld $16, %xmm5 1537; SSE2-NEXT: psrad $16, %xmm5 1538; SSE2-NEXT: packssdw
%xmm6, %xmm5 1539; SSE2-NEXT: psubusw %xmm5, %xmm0 1540; SSE2-NEXT: retq 1541; 1542; SSSE3-LABEL: psubus_8i32_max: 1543; SSSE3: # %bb.0: # %vector.ph 1544; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1545; SSSE3-NEXT: movdqa %xmm2, %xmm4 1546; SSSE3-NEXT: pxor %xmm3, %xmm4 1547; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 1548; SSSE3-NEXT: movdqa %xmm5, %xmm6 1549; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 1550; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 1551; SSSE3-NEXT: pand %xmm6, %xmm2 1552; SSSE3-NEXT: pxor %xmm4, %xmm6 1553; SSSE3-NEXT: por %xmm2, %xmm6 1554; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1555; SSSE3-NEXT: pshufb %xmm2, %xmm6 1556; SSSE3-NEXT: pxor %xmm1, %xmm3 1557; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 1558; SSSE3-NEXT: pxor %xmm5, %xmm4 1559; SSSE3-NEXT: pand %xmm1, %xmm5 1560; SSSE3-NEXT: por %xmm4, %xmm5 1561; SSSE3-NEXT: pshufb %xmm2, %xmm5 1562; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 1563; SSSE3-NEXT: psubusw %xmm5, %xmm0 1564; SSSE3-NEXT: retq 1565; 1566; SSE41-LABEL: psubus_8i32_max: 1567; SSE41: # %bb.0: # %vector.ph 1568; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 1569; SSE41-NEXT: pminud %xmm3, %xmm2 1570; SSE41-NEXT: pminud %xmm3, %xmm1 1571; SSE41-NEXT: packusdw %xmm2, %xmm1 1572; SSE41-NEXT: psubusw %xmm1, %xmm0 1573; SSE41-NEXT: retq 1574; 1575; AVX1-LABEL: psubus_8i32_max: 1576; AVX1: # %bb.0: # %vector.ph 1577; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1578; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] 1579; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 1580; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 1581; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1582; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1583; AVX1-NEXT: vzeroupper 1584; AVX1-NEXT: retq 1585; 1586; AVX2-LABEL: psubus_8i32_max: 1587; AVX2: # %bb.0: # %vector.ph 1588; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 =
[65535,65535,65535,65535,65535,65535,65535,65535] 1589; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 1590; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1591; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1592; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1593; AVX2-NEXT: vzeroupper 1594; AVX2-NEXT: retq 1595; 1596; AVX512-LABEL: psubus_8i32_max: 1597; AVX512: # %bb.0: # %vector.ph 1598; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 1599; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1600; AVX512-NEXT: vzeroupper 1601; AVX512-NEXT: retq 1602vector.ph: 1603 %lhs = zext <8 x i16> %x to <8 x i32> 1604 %cond = icmp ult <8 x i32> %lhs, %y 1605 %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs 1606 %sub = sub <8 x i32> %max, %y 1607 %res = trunc <8 x i32> %sub to <8 x i16> 1608 ret <8 x i16> %res 1609} 1610 1611define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { 1612; SSE2OR3-LABEL: psubus_8i64_max: 1613; SSE2OR3: # %bb.0: # %vector.ph 1614; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] 1615; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 1616; SSE2OR3-NEXT: pxor %xmm5, %xmm7 1617; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] 1618; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 1619; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 1620; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 1621; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] 1622; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 1623; SSE2OR3-NEXT: pand %xmm9, %xmm8 1624; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm7 1625; SSE2OR3-NEXT: pand %xmm8, %xmm4 1626; SSE2OR3-NEXT: pxor %xmm7, %xmm8 1627; SSE2OR3-NEXT: por %xmm4, %xmm8 1628; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 1629; SSE2OR3-NEXT: pxor %xmm5, %xmm4 1630; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] 1631; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 1632; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 1633; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1634; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4 1635; SSE2OR3-NEXT: pand %xmm10, %xmm4 1636; SSE2OR3-NEXT: pand
%xmm4, %xmm3 1637; SSE2OR3-NEXT: pxor %xmm7, %xmm4 1638; SSE2OR3-NEXT: por %xmm3, %xmm4 1639; SSE2OR3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm8[0,2] 1640; SSE2OR3-NEXT: pslld $16, %xmm4 1641; SSE2OR3-NEXT: psrad $16, %xmm4 1642; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 1643; SSE2OR3-NEXT: pxor %xmm5, %xmm3 1644; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] 1645; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 1646; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 1647; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1648; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm3 1649; SSE2OR3-NEXT: pand %xmm9, %xmm3 1650; SSE2OR3-NEXT: pand %xmm3, %xmm2 1651; SSE2OR3-NEXT: pxor %xmm7, %xmm3 1652; SSE2OR3-NEXT: por %xmm2, %xmm3 1653; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 1654; SSE2OR3-NEXT: pxor %xmm5, %xmm2 1655; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] 1656; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 1657; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1658; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm2 1659; SSE2OR3-NEXT: pand %xmm6, %xmm2 1660; SSE2OR3-NEXT: pxor %xmm2, %xmm7 1661; SSE2OR3-NEXT: pand %xmm1, %xmm2 1662; SSE2OR3-NEXT: por %xmm7, %xmm2 1663; SSE2OR3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 1664; SSE2OR3-NEXT: pslld $16, %xmm2 1665; SSE2OR3-NEXT: psrad $16, %xmm2 1666; SSE2OR3-NEXT: packssdw %xmm4, %xmm2 1667; SSE2OR3-NEXT: psubusw %xmm2, %xmm0 1668; SSE2OR3-NEXT: retq 1669; 1670; SSE41-LABEL: psubus_8i64_max: 1671; SSE41: # %bb.0: # %vector.ph 1672; SSE41-NEXT: movdqa %xmm0, %xmm5 1673; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] 1674; SSE41-NEXT: movdqa %xmm4, %xmm8 1675; SSE41-NEXT: pxor %xmm9, %xmm8 1676; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] 1677; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] 1678; SSE41-NEXT: pcmpeqd %xmm7, %xmm8 1679; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 1680; SSE41-NEXT: movdqa %xmm6, %xmm0 1681; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 1682; SSE41-NEXT: pand 
%xmm8, %xmm0 1683; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] 1684; SSE41-NEXT: movapd %xmm8, %xmm10 1685; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 1686; SSE41-NEXT: movdqa %xmm3, %xmm4 1687; SSE41-NEXT: pxor %xmm9, %xmm4 1688; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] 1689; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 1690; SSE41-NEXT: movdqa %xmm6, %xmm0 1691; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 1692; SSE41-NEXT: pand %xmm4, %xmm0 1693; SSE41-NEXT: movapd %xmm8, %xmm4 1694; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 1695; SSE41-NEXT: packusdw %xmm10, %xmm4 1696; SSE41-NEXT: movdqa %xmm2, %xmm3 1697; SSE41-NEXT: pxor %xmm9, %xmm3 1698; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] 1699; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 1700; SSE41-NEXT: movdqa %xmm6, %xmm0 1701; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 1702; SSE41-NEXT: pand %xmm3, %xmm0 1703; SSE41-NEXT: movapd %xmm8, %xmm3 1704; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 1705; SSE41-NEXT: pxor %xmm1, %xmm9 1706; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 1707; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] 1708; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 1709; SSE41-NEXT: pand %xmm7, %xmm6 1710; SSE41-NEXT: movdqa %xmm6, %xmm0 1711; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 1712; SSE41-NEXT: packusdw %xmm3, %xmm8 1713; SSE41-NEXT: packusdw %xmm4, %xmm8 1714; SSE41-NEXT: psubusw %xmm8, %xmm5 1715; SSE41-NEXT: movdqa %xmm5, %xmm0 1716; SSE41-NEXT: retq 1717; 1718; AVX1-LABEL: psubus_8i64_max: 1719; AVX1: # %bb.0: # %vector.ph 1720; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1721; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 1722; AVX1-NEXT: # xmm4 = mem[0,0] 1723; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 1724; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] 1725; AVX1-NEXT: # xmm6 = mem[0,0] 1726; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 1727; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] 1728; AVX1-NEXT: # xmm7 = mem[0,0] 1729; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 
1730; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 1731; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 1732; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2 1733; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1734; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1735; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 1736; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 1737; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 1738; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 1739; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 1740; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1 1741; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1742; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1743; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1744; AVX1-NEXT: vzeroupper 1745; AVX1-NEXT: retq 1746; 1747; AVX2-LABEL: psubus_8i64_max: 1748; AVX2: # %bb.0: # %vector.ph 1749; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 1750; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 1751; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] 1752; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 1753; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535] 1754; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 1755; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 1756; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 1757; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 1758; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 1759; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1760; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1761; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 1762; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1763; AVX2-NEXT: vzeroupper 1764; AVX2-NEXT: retq 1765; 1766; AVX512-LABEL: psubus_8i64_max: 1767; AVX512: # %bb.0: # %vector.ph 1768; AVX512-NEXT: vpmovusqw %zmm1, %xmm1 1769; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1770; AVX512-NEXT: vzeroupper 1771; AVX512-NEXT: retq 1772vector.ph: 1773 %lhs = zext <8 x i16> %x to <8 x i64> 1774 %cond = icmp ult <8 
x i64> %lhs, %y 1775 %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs 1776 %sub = sub <8 x i64> %max, %y 1777 %res = trunc <8 x i64> %sub to <8 x i16> 1778 ret <8 x i16> %res 1779} 1780 1781define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { 1782; SSE2-LABEL: psubus_16i32_max: 1783; SSE2: # %bb.0: # %vector.ph 1784; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] 1785; SSE2-NEXT: movdqa %xmm3, %xmm8 1786; SSE2-NEXT: pxor %xmm7, %xmm8 1787; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 1788; SSE2-NEXT: movdqa %xmm6, %xmm9 1789; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 1790; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 1791; SSE2-NEXT: pand %xmm9, %xmm3 1792; SSE2-NEXT: pxor %xmm8, %xmm9 1793; SSE2-NEXT: por %xmm3, %xmm9 1794; SSE2-NEXT: pslld $16, %xmm9 1795; SSE2-NEXT: psrad $16, %xmm9 1796; SSE2-NEXT: movdqa %xmm2, %xmm3 1797; SSE2-NEXT: pxor %xmm7, %xmm3 1798; SSE2-NEXT: movdqa %xmm6, %xmm10 1799; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 1800; SSE2-NEXT: pand %xmm10, %xmm2 1801; SSE2-NEXT: pxor %xmm8, %xmm10 1802; SSE2-NEXT: por %xmm2, %xmm10 1803; SSE2-NEXT: pslld $16, %xmm10 1804; SSE2-NEXT: psrad $16, %xmm10 1805; SSE2-NEXT: packssdw %xmm9, %xmm10 1806; SSE2-NEXT: psubusw %xmm10, %xmm0 1807; SSE2-NEXT: movdqa %xmm5, %xmm2 1808; SSE2-NEXT: pxor %xmm7, %xmm2 1809; SSE2-NEXT: movdqa %xmm6, %xmm3 1810; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 1811; SSE2-NEXT: pand %xmm3, %xmm5 1812; SSE2-NEXT: pxor %xmm8, %xmm3 1813; SSE2-NEXT: por %xmm5, %xmm3 1814; SSE2-NEXT: pslld $16, %xmm3 1815; SSE2-NEXT: psrad $16, %xmm3 1816; SSE2-NEXT: pxor %xmm4, %xmm7 1817; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 1818; SSE2-NEXT: pxor %xmm6, %xmm8 1819; SSE2-NEXT: pand %xmm4, %xmm6 1820; SSE2-NEXT: por %xmm8, %xmm6 1821; SSE2-NEXT: pslld $16, %xmm6 1822; SSE2-NEXT: psrad $16, %xmm6 1823; SSE2-NEXT: packssdw %xmm3, %xmm6 1824; SSE2-NEXT: psubusw %xmm6, %xmm1 1825; SSE2-NEXT: retq 1826; 1827; SSSE3-LABEL: psubus_16i32_max: 1828; SSSE3: 
# %bb.0: # %vector.ph 1829; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] 1830; SSSE3-NEXT: movdqa %xmm3, %xmm8 1831; SSSE3-NEXT: pxor %xmm7, %xmm8 1832; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 1833; SSSE3-NEXT: movdqa %xmm6, %xmm9 1834; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 1835; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 1836; SSSE3-NEXT: pand %xmm9, %xmm3 1837; SSSE3-NEXT: pxor %xmm8, %xmm9 1838; SSSE3-NEXT: por %xmm3, %xmm9 1839; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1840; SSSE3-NEXT: pshufb %xmm3, %xmm9 1841; SSSE3-NEXT: movdqa %xmm2, %xmm10 1842; SSSE3-NEXT: pxor %xmm7, %xmm10 1843; SSSE3-NEXT: movdqa %xmm6, %xmm11 1844; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 1845; SSSE3-NEXT: pand %xmm11, %xmm2 1846; SSSE3-NEXT: pxor %xmm8, %xmm11 1847; SSSE3-NEXT: por %xmm2, %xmm11 1848; SSSE3-NEXT: pshufb %xmm3, %xmm11 1849; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm9[0] 1850; SSSE3-NEXT: psubusw %xmm11, %xmm0 1851; SSSE3-NEXT: movdqa %xmm5, %xmm2 1852; SSSE3-NEXT: pxor %xmm7, %xmm2 1853; SSSE3-NEXT: movdqa %xmm6, %xmm9 1854; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 1855; SSSE3-NEXT: pand %xmm9, %xmm5 1856; SSSE3-NEXT: pxor %xmm8, %xmm9 1857; SSSE3-NEXT: por %xmm5, %xmm9 1858; SSSE3-NEXT: pshufb %xmm3, %xmm9 1859; SSSE3-NEXT: pxor %xmm4, %xmm7 1860; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 1861; SSSE3-NEXT: pxor %xmm6, %xmm8 1862; SSSE3-NEXT: pand %xmm4, %xmm6 1863; SSSE3-NEXT: por %xmm8, %xmm6 1864; SSSE3-NEXT: pshufb %xmm3, %xmm6 1865; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0] 1866; SSSE3-NEXT: psubusw %xmm6, %xmm1 1867; SSSE3-NEXT: retq 1868; 1869; SSE41-LABEL: psubus_16i32_max: 1870; SSE41: # %bb.0: # %vector.ph 1871; SSE41-NEXT: pmovsxbw {{.*#+}} xmm6 = [65535,0,65535,0,65535,0,65535,0] 1872; SSE41-NEXT: pminud %xmm6, %xmm3 1873; SSE41-NEXT: pminud %xmm6, %xmm2 1874; SSE41-NEXT: packusdw %xmm3, %xmm2 1875; SSE41-NEXT: psubusw %xmm2, %xmm0 1876; SSE41-NEXT: pminud 
%xmm6, %xmm5 1877; SSE41-NEXT: pminud %xmm6, %xmm4 1878; SSE41-NEXT: packusdw %xmm5, %xmm4 1879; SSE41-NEXT: psubusw %xmm4, %xmm1 1880; SSE41-NEXT: retq 1881; 1882; AVX1-LABEL: psubus_16i32_max: 1883; AVX1: # %bb.0: # %vector.ph 1884; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1885; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65535,65535,65535,65535] 1886; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 1887; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 1888; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1889; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1890; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2 1891; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1892; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 1893; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 1894; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1895; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1896; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1897; AVX1-NEXT: retq 1898; 1899; AVX2-LABEL: psubus_16i32_max: 1900; AVX2: # %bb.0: # %vector.ph 1901; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] 1902; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2 1903; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 1904; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 1905; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] 1906; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1907; AVX2-NEXT: retq 1908; 1909; AVX512-LABEL: psubus_16i32_max: 1910; AVX512: # %bb.0: # %vector.ph 1911; AVX512-NEXT: vpmovusdw %zmm1, %ymm1 1912; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1913; AVX512-NEXT: retq 1914vector.ph: 1915 %lhs = zext <16 x i16> %x to <16 x i32> 1916 %cond = icmp ult <16 x i32> %lhs, %y 1917 %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs 1918 %sub = sub <16 x i32> %max, %y 1919 %res = trunc <16 x i32> %sub to <16 x i16> 1920 ret <16 x i16> %res 1921} 1922 1923define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind { 1924; SSE2-LABEL: psubus_i16_i32_max_swapped: 1925; SSE2: # %bb.0: # %vector.ph 1926; SSE2-NEXT: movdqa {{.*#+}} 
xmm3 = [2147483648,2147483648,2147483648,2147483648] 1927; SSE2-NEXT: movdqa %xmm2, %xmm4 1928; SSE2-NEXT: pxor %xmm3, %xmm4 1929; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 1930; SSE2-NEXT: movdqa %xmm5, %xmm6 1931; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 1932; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1933; SSE2-NEXT: pand %xmm6, %xmm2 1934; SSE2-NEXT: pxor %xmm4, %xmm6 1935; SSE2-NEXT: por %xmm2, %xmm6 1936; SSE2-NEXT: pslld $16, %xmm6 1937; SSE2-NEXT: psrad $16, %xmm6 1938; SSE2-NEXT: pxor %xmm1, %xmm3 1939; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 1940; SSE2-NEXT: pxor %xmm5, %xmm4 1941; SSE2-NEXT: pand %xmm1, %xmm5 1942; SSE2-NEXT: por %xmm4, %xmm5 1943; SSE2-NEXT: pslld $16, %xmm5 1944; SSE2-NEXT: psrad $16, %xmm5 1945; SSE2-NEXT: packssdw %xmm6, %xmm5 1946; SSE2-NEXT: psubusw %xmm5, %xmm0 1947; SSE2-NEXT: retq 1948; 1949; SSSE3-LABEL: psubus_i16_i32_max_swapped: 1950; SSSE3: # %bb.0: # %vector.ph 1951; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 1952; SSSE3-NEXT: movdqa %xmm2, %xmm4 1953; SSSE3-NEXT: pxor %xmm3, %xmm4 1954; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 1955; SSSE3-NEXT: movdqa %xmm5, %xmm6 1956; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 1957; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 1958; SSSE3-NEXT: pand %xmm6, %xmm2 1959; SSSE3-NEXT: pxor %xmm4, %xmm6 1960; SSSE3-NEXT: por %xmm2, %xmm6 1961; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1962; SSSE3-NEXT: pshufb %xmm2, %xmm6 1963; SSSE3-NEXT: pxor %xmm1, %xmm3 1964; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 1965; SSSE3-NEXT: pxor %xmm5, %xmm4 1966; SSSE3-NEXT: pand %xmm1, %xmm5 1967; SSSE3-NEXT: por %xmm4, %xmm5 1968; SSSE3-NEXT: pshufb %xmm2, %xmm5 1969; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 1970; SSSE3-NEXT: psubusw %xmm5, %xmm0 1971; SSSE3-NEXT: retq 1972; 1973; SSE41-LABEL: psubus_i16_i32_max_swapped: 1974; SSE41: # %bb.0: # %vector.ph 1975; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = 
[65535,0,65535,0,65535,0,65535,0] 1976; SSE41-NEXT: pminud %xmm3, %xmm2 1977; SSE41-NEXT: pminud %xmm3, %xmm1 1978; SSE41-NEXT: packusdw %xmm2, %xmm1 1979; SSE41-NEXT: psubusw %xmm1, %xmm0 1980; SSE41-NEXT: retq 1981; 1982; AVX1-LABEL: psubus_i16_i32_max_swapped: 1983; AVX1: # %bb.0: # %vector.ph 1984; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1985; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] 1986; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 1987; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 1988; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1989; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1990; AVX1-NEXT: vzeroupper 1991; AVX1-NEXT: retq 1992; 1993; AVX2-LABEL: psubus_i16_i32_max_swapped: 1994; AVX2: # %bb.0: # %vector.ph 1995; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 1996; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 1997; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1998; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1999; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2000; AVX2-NEXT: vzeroupper 2001; AVX2-NEXT: retq 2002; 2003; AVX512-LABEL: psubus_i16_i32_max_swapped: 2004; AVX512: # %bb.0: # %vector.ph 2005; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 2006; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2007; AVX512-NEXT: vzeroupper 2008; AVX512-NEXT: retq 2009vector.ph: 2010 %lhs = zext <8 x i16> %x to <8 x i32> 2011 %cond = icmp ult <8 x i32> %y, %lhs 2012 %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y 2013 %sub = sub <8 x i32> %max, %y 2014 %res = trunc <8 x i32> %sub to <8 x i16> 2015 ret <8 x i16> %res 2016} 2017 2018define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { 2019; SSE2-LABEL: psubus_i16_i32_min: 2020; SSE2: # %bb.0: # %vector.ph 2021; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 2022; SSE2-NEXT: movdqa %xmm2, %xmm4 2023; SSE2-NEXT: pxor %xmm3, %xmm4 2024; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 2025; SSE2-NEXT: movdqa 
%xmm5, %xmm6 2026; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 2027; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 2028; SSE2-NEXT: pand %xmm6, %xmm2 2029; SSE2-NEXT: pxor %xmm4, %xmm6 2030; SSE2-NEXT: por %xmm2, %xmm6 2031; SSE2-NEXT: pslld $16, %xmm6 2032; SSE2-NEXT: psrad $16, %xmm6 2033; SSE2-NEXT: pxor %xmm1, %xmm3 2034; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 2035; SSE2-NEXT: pxor %xmm5, %xmm4 2036; SSE2-NEXT: pand %xmm1, %xmm5 2037; SSE2-NEXT: por %xmm4, %xmm5 2038; SSE2-NEXT: pslld $16, %xmm5 2039; SSE2-NEXT: psrad $16, %xmm5 2040; SSE2-NEXT: packssdw %xmm6, %xmm5 2041; SSE2-NEXT: psubusw %xmm5, %xmm0 2042; SSE2-NEXT: retq 2043; 2044; SSSE3-LABEL: psubus_i16_i32_min: 2045; SSSE3: # %bb.0: # %vector.ph 2046; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 2047; SSSE3-NEXT: movdqa %xmm2, %xmm4 2048; SSSE3-NEXT: pxor %xmm3, %xmm4 2049; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 2050; SSSE3-NEXT: movdqa %xmm5, %xmm6 2051; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 2052; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 2053; SSSE3-NEXT: pand %xmm6, %xmm2 2054; SSSE3-NEXT: pxor %xmm4, %xmm6 2055; SSSE3-NEXT: por %xmm2, %xmm6 2056; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2057; SSSE3-NEXT: pshufb %xmm2, %xmm6 2058; SSSE3-NEXT: pxor %xmm1, %xmm3 2059; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 2060; SSSE3-NEXT: pxor %xmm5, %xmm4 2061; SSSE3-NEXT: pand %xmm1, %xmm5 2062; SSSE3-NEXT: por %xmm4, %xmm5 2063; SSSE3-NEXT: pshufb %xmm2, %xmm5 2064; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 2065; SSSE3-NEXT: psubusw %xmm5, %xmm0 2066; SSSE3-NEXT: retq 2067; 2068; SSE41-LABEL: psubus_i16_i32_min: 2069; SSE41: # %bb.0: # %vector.ph 2070; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 2071; SSE41-NEXT: pminud %xmm3, %xmm2 2072; SSE41-NEXT: pminud %xmm3, %xmm1 2073; SSE41-NEXT: packusdw %xmm2, %xmm1 2074; SSE41-NEXT: psubusw %xmm1, %xmm0 2075; SSE41-NEXT: retq 2076; 2077; AVX1-LABEL: psubus_i16_i32_min: 
2078; AVX1: # %bb.0: # %vector.ph 2079; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2080; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] 2081; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 2082; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 2083; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2084; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2085; AVX1-NEXT: vzeroupper 2086; AVX1-NEXT: retq 2087; 2088; AVX2-LABEL: psubus_i16_i32_min: 2089; AVX2: # %bb.0: # %vector.ph 2090; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 2091; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 2092; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2093; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2094; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2095; AVX2-NEXT: vzeroupper 2096; AVX2-NEXT: retq 2097; 2098; AVX512-LABEL: psubus_i16_i32_min: 2099; AVX512: # %bb.0: # %vector.ph 2100; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 2101; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2102; AVX512-NEXT: vzeroupper 2103; AVX512-NEXT: retq 2104vector.ph: 2105 %lhs = zext <8 x i16> %x to <8 x i32> 2106 %cond = icmp ult <8 x i32> %lhs, %y 2107 %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y 2108 %sub = sub <8 x i32> %lhs, %min 2109 %res = trunc <8 x i32> %sub to <8 x i16> 2110 ret <8 x i16> %res 2111} 2112 2113define void @subus_v8i8(ptr %p1, ptr %p2) { 2114; SSE-LABEL: subus_v8i8: 2115; SSE: # %bb.0: 2116; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2117; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2118; SSE-NEXT: psubusb %xmm1, %xmm0 2119; SSE-NEXT: movq %xmm0, (%rdi) 2120; SSE-NEXT: retq 2121; 2122; AVX-LABEL: subus_v8i8: 2123; AVX: # %bb.0: 2124; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2125; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 2126; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 2127; AVX-NEXT: vmovq %xmm0, (%rdi) 2128; AVX-NEXT: retq 2129 %ld1 = load <8 x i8>, ptr %p1, align 8 2130 %ld2 = load <8 x i8>, ptr %p2, align 8 2131 %1 = sub <8 x i8> %ld1, %ld2 2132 %2 = icmp ugt <8 x i8> 
%ld1, %ld2 2133 %sh3 = select <8 x i1> %2, <8 x i8> %1, <8 x i8> zeroinitializer 2134 store <8 x i8> %sh3, ptr %p1, align 8 2135 ret void 2136} 2137
; Narrow-vector case: select(ld1 >u ld2, ld1 - ld2, 0) on <4 x i8> loaded from
; memory is expected to become a single (v)psubusb, with the result written
; back to %p1 through a 32-bit (v)movd store.
2138define void @subus_v4i8(ptr %p1, ptr %p2) { 2139; SSE-LABEL: subus_v4i8: 2140; SSE: # %bb.0: 2141; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2142; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2143; SSE-NEXT: psubusb %xmm1, %xmm0 2144; SSE-NEXT: movd %xmm0, (%rdi) 2145; SSE-NEXT: retq 2146; 2147; AVX-LABEL: subus_v4i8: 2148; AVX: # %bb.0: 2149; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2150; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 2151; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 2152; AVX-NEXT: vmovd %xmm0, (%rdi) 2153; AVX-NEXT: retq 2154 %ld1 = load <4 x i8>, ptr %p1, align 8 2155 %ld2 = load <4 x i8>, ptr %p2, align 8 2156 %1 = sub <4 x i8> %ld1, %ld2 2157 %2 = icmp ugt <4 x i8> %ld1, %ld2 2158 %sh3 = select <4 x i1> %2, <4 x i8> %1, <4 x i8> zeroinitializer 2159 store <4 x i8> %sh3, ptr %p1, align 8 2160 ret void 2161} 2162
; Same select(ld1 >u ld2, ld1 - ld2, 0) pattern on <2 x i8>: still one
; (v)psubusb, but only the low 16 bits are stored (movd + movw on SSE2/SSSE3,
; (v)pextrw straight to memory on SSE4.1 and AVX).
2163define void @subus_v2i8(ptr %p1, ptr %p2) { 2164; SSE2OR3-LABEL: subus_v2i8: 2165; SSE2OR3: # %bb.0: 2166; SSE2OR3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2167; SSE2OR3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2168; SSE2OR3-NEXT: psubusb %xmm1, %xmm0 2169; SSE2OR3-NEXT: movd %xmm0, %eax 2170; SSE2OR3-NEXT: movw %ax, (%rdi) 2171; SSE2OR3-NEXT: retq 2172; 2173; SSE41-LABEL: subus_v2i8: 2174; SSE41: # %bb.0: 2175; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2176; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2177; SSE41-NEXT: psubusb %xmm1, %xmm0 2178; SSE41-NEXT: pextrw $0, %xmm0, (%rdi) 2179; SSE41-NEXT: retq 2180; 2181; AVX-LABEL: subus_v2i8: 2182; AVX: # %bb.0: 2183; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2184; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 2185; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 2186; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) 2187; AVX-NEXT: retq 2188 %ld1 = load <2 x i8>, ptr %p1, align 8 2189 %ld2 = load <2 x i8>, ptr %p2, align 8 2190 %1 = sub <2 x i8> %ld1, %ld2 2191 
%2 = icmp ugt <2 x i8> %ld1, %ld2 2192 %sh3 = select <2 x i1> %2, <2 x i8> %1, <2 x i8> zeroinitializer 2193 store <2 x i8> %sh3, ptr %p1, align 8 2194 ret void 2195} 2196
; select(ld1 >u ld2, ld1 - ld2, 0) on <4 x i16>: expected to become a single
; (v)psubusw, with the 64-bit result stored back to %p1 via (v)movq.
2197define void @subus_v4i16(ptr %p1, ptr %p2) { 2198; SSE-LABEL: subus_v4i16: 2199; SSE: # %bb.0: 2200; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2201; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2202; SSE-NEXT: psubusw %xmm1, %xmm0 2203; SSE-NEXT: movq %xmm0, (%rdi) 2204; SSE-NEXT: retq 2205; 2206; AVX-LABEL: subus_v4i16: 2207; AVX: # %bb.0: 2208; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2209; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 2210; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2211; AVX-NEXT: vmovq %xmm0, (%rdi) 2212; AVX-NEXT: retq 2213 %ld1 = load <4 x i16>, ptr %p1, align 8 2214 %ld2 = load <4 x i16>, ptr %p2, align 8 2215 %1 = sub <4 x i16> %ld1, %ld2 2216 %2 = icmp ugt <4 x i16> %ld1, %ld2 2217 %sh3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> zeroinitializer 2218 store <4 x i16> %sh3, ptr %p1, align 8 2219 ret void 2220} 2221
; Same pattern on <2 x i16>: one (v)psubusw, with only the low 32 bits stored
; back to %p1 via (v)movd.
2222define void @subus_v2i16(ptr %p1, ptr %p2) { 2223; SSE-LABEL: subus_v2i16: 2224; SSE: # %bb.0: 2225; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2226; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2227; SSE-NEXT: psubusw %xmm1, %xmm0 2228; SSE-NEXT: movd %xmm0, (%rdi) 2229; SSE-NEXT: retq 2230; 2231; AVX-LABEL: subus_v2i16: 2232; AVX: # %bb.0: 2233; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2234; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 2235; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2236; AVX-NEXT: vmovd %xmm0, (%rdi) 2237; AVX-NEXT: retq 2238 %ld1 = load <2 x i16>, ptr %p1, align 8 2239 %ld2 = load <2 x i16>, ptr %p2, align 8 2240 %1 = sub <2 x i16> %ld1, %ld2 2241 %2 = icmp ugt <2 x i16> %ld1, %ld2 2242 %sh3 = select <2 x i1> %2, <2 x i16> %1, <2 x i16> zeroinitializer 2243 store <2 x i16> %sh3, ptr %p1, align 8 2244 ret void 2245} 2246
; umax(x, 70) + (-70) on <16 x i8> with a splat constant: expected to fold to a
; single (v)psubusb against a constant-pool operand.
2247define <16 x i8> @test19(<16 x i8> %x) { 2248; SSE-LABEL: test19: 2249; SSE: # %bb.0: # %entry 2250; SSE-NEXT: 
psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2251; SSE-NEXT: retq 2252; 2253; AVX-LABEL: test19: 2254; AVX: # %bb.0: # %entry 2255; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2256; AVX-NEXT: retq 2257entry: 2258 %0 = icmp ugt <16 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70> 2259 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70> 2260 %2 = add <16 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70> 2261 ret <16 x i8> %2 2262} 2263
; Non-splat variant of the umax(x, C) + (-C) pattern on <16 x i8>: every add
; element is the negation of the matching compare/select element, so the whole
; sequence should still fold to one (v)psubusb with a constant-pool operand.
2264define <16 x i8> @test20(<16 x i8> %x) { 2265; SSE-LABEL: test20: 2266; SSE: # %bb.0: # %entry 2267; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2268; SSE-NEXT: retq 2269; 2270; AVX-LABEL: test20: 2271; AVX: # %bb.0: # %entry 2272; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2273; AVX-NEXT: retq 2274entry: 2275 %0 = icmp ugt <16 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70> 2276 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70> 2277 %2 = add <16 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70> 2278 ret <16 x i8> %2 2279} 2280
; umax(x, 700) + (-700) on <8 x i16> with a splat constant: expected to fold to
; a single (v)psubusw against a constant-pool operand.
2281define <8 x i16> @test21(<8 x i16> %x) { 2282; SSE-LABEL: test21: 2283; SSE: # %bb.0: # %entry 2284; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2285; SSE-NEXT: retq 2286; 2287; AVX-LABEL: test21: 2288; AVX: # %bb.0: # %entry 2289; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2290; AVX-NEXT: retq 2291entry: 2292 %0 = icmp ugt <8 x i16> %x, <i16 700, i16 700, i16 700, 
i16 700, i16 700, i16 700, i16 700, i16 700> 2293 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700> 2294 %2 = add <8 x i16> %1, <i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700> 2295 ret <8 x i16> %2 2296} 2297
; Non-splat variant of the umax(x, C) + (-C) pattern on <8 x i16>: every add
; element is the negation of the matching compare/select element, so this
; should still fold to one (v)psubusw with a constant-pool operand.
2298define <8 x i16> @test22(<8 x i16> %x) { 2299; SSE-LABEL: test22: 2300; SSE: # %bb.0: # %entry 2301; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2302; SSE-NEXT: retq 2303; 2304; AVX-LABEL: test22: 2305; AVX: # %bb.0: # %entry 2306; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2307; AVX-NEXT: retq 2308entry: 2309 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70> 2310 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70> 2311 %2 = add <8 x i16> %1, <i16 -1, i16 22000, i16 770, i16 -98, i16 -19, i16 -1000, i16 -3456, i16 -70> 2312 ret <8 x i16> %2 2313} 2314
; 256-bit version of the splat umax(x, 70) + (-70) pattern on <32 x i8>: SSE and
; AVX1 are expected to split it into two 128-bit (v)psubusb sharing one constant
; register, while AVX2/AVX512 use a single ymm vpsubusb.
2315define <32 x i8> @test23(<32 x i8> %x) { 2316; SSE-LABEL: test23: 2317; SSE: # %bb.0: # %entry 2318; SSE-NEXT: movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] 2319; SSE-NEXT: psubusb %xmm2, %xmm0 2320; SSE-NEXT: psubusb %xmm2, %xmm1 2321; SSE-NEXT: retq 2322; 2323; AVX1-LABEL: test23: 2324; AVX1: # %bb.0: # %entry 2325; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2326; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] 2327; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 2328; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 2329; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2330; AVX1-NEXT: retq 2331; 2332; AVX2-LABEL: test23: 2333; AVX2: # %bb.0: # %entry 2334; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2335; AVX2-NEXT: retq 2336; 2337; AVX512-LABEL: test23: 2338; AVX512: # %bb.0: # %entry 2339; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2340; AVX512-NEXT: 
retq 2341entry: 2342 %0 = icmp ugt <32 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70> 2343 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70> 2344 %2 = add <32 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70> 2345 ret <32 x i8> %2 2346} 2347 2348define <32 x i8> @test24(<32 x i8> %x) { 2349; SSE-LABEL: test24: 2350; SSE: # %bb.0: # %entry 2351; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2352; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2353; SSE-NEXT: retq 2354; 2355; AVX1-LABEL: test24: 2356; AVX1: # %bb.0: # %entry 2357; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2358; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2359; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2360; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2361; AVX1-NEXT: retq 2362; 2363; AVX2-LABEL: test24: 2364; AVX2: # %bb.0: # %entry 2365; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2366; AVX2-NEXT: retq 2367; 2368; AVX512-LABEL: test24: 2369; AVX512: # %bb.0: # %entry 2370; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2371; AVX512-NEXT: retq 2372entry: 2373 %0 = icmp ugt <32 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 
19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70> 2374 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70> 2375 %2 = add <32 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70> 2376 ret <32 x i8> %2 2377} 2378 2379define <16 x i16> @test25(<16 x i16> %x) { 2380; SSE-LABEL: test25: 2381; SSE: # %bb.0: # %entry 2382; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] 2383; SSE-NEXT: psubusw %xmm2, %xmm0 2384; SSE-NEXT: psubusw %xmm2, %xmm1 2385; SSE-NEXT: retq 2386; 2387; AVX1-LABEL: test25: 2388; AVX1: # %bb.0: # %entry 2389; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2390; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] 2391; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 2392; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 2393; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2394; AVX1-NEXT: retq 2395; 2396; AVX2-LABEL: test25: 2397; AVX2: # %bb.0: # %entry 2398; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2399; AVX2-NEXT: retq 2400; 2401; AVX512-LABEL: test25: 2402; AVX512: # %bb.0: # %entry 2403; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2404; AVX512-NEXT: retq 2405entry: 2406 %0 = icmp ugt <16 x i16> %x, <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000> 2407 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, 
i16 5000, i16 5000, i16 5000, i16 5000> 2408 %2 = add <16 x i16> %1, <i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000> 2409 ret <16 x i16> %2 2410} 2411
; 256-bit non-splat umax(x, C) + (-C) pattern on <16 x i16>: every add element
; is the negation of the matching compare/select element. SSE and AVX1 are
; expected to emit two 128-bit (v)psubusw, each with its own constant-pool
; operand; AVX2/AVX512 use a single ymm vpsubusw.
2412define <16 x i16> @test26(<16 x i16> %x) { 2413; SSE-LABEL: test26: 2414; SSE: # %bb.0: # %entry 2415; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2416; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2417; SSE-NEXT: retq 2418; 2419; AVX1-LABEL: test26: 2420; AVX1: # %bb.0: # %entry 2421; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2422; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2423; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2424; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2425; AVX1-NEXT: retq 2426; 2427; AVX2-LABEL: test26: 2428; AVX2: # %bb.0: # %entry 2429; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2430; AVX2-NEXT: retq 2431; 2432; AVX512-LABEL: test26: 2433; AVX512: # %bb.0: # %entry 2434; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2435; AVX512-NEXT: retq 2436entry: 2437 %0 = icmp ugt <16 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70> 2438 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70> 2439 %2 = add <16 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70> 2440 ret <16 x i16> %2 2441} 2442 2443define <64 x i8> @test27(<64 x i8> %x) { 2444; SSE-LABEL: test27: 2445; SSE: # %bb.0: # %entry 2446; SSE-NEXT: movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] 2447; SSE-NEXT: 
psubusb %xmm4, %xmm0 2448; SSE-NEXT: psubusb %xmm4, %xmm1 2449; SSE-NEXT: psubusb %xmm4, %xmm2 2450; SSE-NEXT: psubusb %xmm4, %xmm3 2451; SSE-NEXT: retq 2452; 2453; AVX1-LABEL: test27: 2454; AVX1: # %bb.0: # %entry 2455; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2456; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] 2457; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2 2458; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0 2459; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2460; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2461; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2 2462; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1 2463; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2464; AVX1-NEXT: retq 2465; 2466; AVX2-LABEL: test27: 2467; AVX2: # %bb.0: # %entry 2468; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] 2469; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 2470; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1 2471; AVX2-NEXT: retq 2472; 2473; AVX512-LABEL: test27: 2474; AVX512: # %bb.0: # %entry 2475; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2476; AVX512-NEXT: retq 2477entry: 2478 %0 = icmp ugt <64 x i8> %x, <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154> 2479 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 
154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154> 2480 %2 = add <64 x i8> %1, <i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154> 2481 ret <64 x i8> %2 2482} 2483 2484define <64 x i8> @test28(<64 x i8> %x) { 2485; SSE-LABEL: test28: 2486; SSE: # %bb.0: # %entry 2487; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70] 2488; SSE-NEXT: psubusb %xmm4, %xmm0 2489; SSE-NEXT: psubusb %xmm4, %xmm2 2490; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2491; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2492; SSE-NEXT: retq 2493; 2494; AVX1-LABEL: test28: 2495; AVX1: # %bb.0: # %entry 2496; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70] 2497; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm3 2498; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2499; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2500; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 2501; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2 2502; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2503; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2504; AVX1-NEXT: vinsertf128 $1, 
%xmm1, %ymm2, %ymm1 2505; AVX1-NEXT: retq 2506; 2507; AVX2-LABEL: test28: 2508; AVX2: # %bb.0: # %entry 2509; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2510; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2511; AVX2-NEXT: retq 2512; 2513; AVX512-LABEL: test28: 2514; AVX512: # %bb.0: # %entry 2515; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2516; AVX512-NEXT: retq 2517entry: 2518 %0 = icmp ugt <64 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70> 2519 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70> 2520 %2 = add <64 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70, i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 116, i8 77, i8 70, i8 -123, i8 -98, i8 -67, i8 -19, i8 22, i8 -110, i8 -25, i8 
-34, i8 -55, i8 -70> 2521 ret <64 x i8> %2 2522} 2523 2524define <32 x i16> @test29(<32 x i16> %x) { 2525; SSE-LABEL: test29: 2526; SSE: # %bb.0: # %entry 2527; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2528; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2529; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2530; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2531; SSE-NEXT: retq 2532; 2533; AVX1-LABEL: test29: 2534; AVX1: # %bb.0: # %entry 2535; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 2536; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2537; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2538; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2539; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 2540; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2541; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2542; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2543; AVX1-NEXT: retq 2544; 2545; AVX2-LABEL: test29: 2546; AVX2: # %bb.0: # %entry 2547; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2548; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2549; AVX2-NEXT: retq 2550; 2551; AVX512-LABEL: test29: 2552; AVX512: # %bb.0: # %entry 2553; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2554; AVX512-NEXT: retq 2555entry: 2556 %0 = icmp ugt <32 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70> 2557 %1 = select <32 x i1> %0, <32 x i16> %x, <32 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, 
i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70> 2558 %2 = add <32 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70, i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9805, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -346, i16 -55, i16 -70> 2559 ret <32 x i16> %2 2560} 2561 2562; PR40083 - umax(x,C)+(-C) with the upper lanes of C undef, only the low i64 of the result is returned, and a single psubusw is expected 2563define i64 @test30(<8 x i16> %x) { 2564; SSE-LABEL: test30: 2565; SSE: # %bb.0: # %entry 2566; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2567; SSE-NEXT: movq %xmm0, %rax 2568; SSE-NEXT: retq 2569; 2570; AVX-LABEL: test30: 2571; AVX: # %bb.0: # %entry 2572; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2573; AVX-NEXT: vmovq %xmm0, %rax 2574; AVX-NEXT: retq 2575entry: 2576 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef> 2577 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef> 2578 %2 = add <8 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 undef, i16 undef, i16 undef, i16 undef> 2579 %3 = bitcast <8 x i16> %2 to <2 x i64> 2580 %4 = extractelement <2 x i64> %3, i32 0 2581 ret i64 %4 2582} 2583 2584; PR40083 - select(x >u C, x-(C+1), 0) on i8 lanes with the upper lanes undef, only the low i64 of the result is returned, and a single psubusb is expected 2585define i64 @test31(<2 x i64> %x) { 2586; SSE-LABEL: test31: 2587; SSE: # %bb.0: 2588; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2589; SSE-NEXT: movq %xmm0, %rax 2590; SSE-NEXT: retq 2591; 2592; AVX-LABEL: test31: 2593; AVX: # %bb.0: 2594; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2595; AVX-NEXT: vmovq %xmm0, %rax 2596; AVX-NEXT: retq 2597 %t0 = bitcast <2 x i64> %x to <16 x i8> 2598 %cmp = icmp ugt <16 x i8> %t0, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> 2599 %bop = add <16 x i8> %t0, <i8 -71, i8 -71, i8 -71, i8 
-71, i8 -71, i8 -71, i8 -71, i8 -71, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> 2600 %sel = select <16 x i1> %cmp, <16 x i8> %bop, <16 x i8> zeroinitializer 2601 %bc = bitcast <16 x i8> %sel to <2 x i64> 2602 %ext = extractelement <2 x i64> %bc, i32 0 2603 ret i64 %ext 2604} 2605 2606; v8i16/v8i32 - sub(x,trunc(umin(zext(x),y))) 2607define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) { 2608; SSE2-LABEL: test32: 2609; SSE2: # %bb.0: 2610; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 2611; SSE2-NEXT: movdqa %xmm2, %xmm4 2612; SSE2-NEXT: pxor %xmm3, %xmm4 2613; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 2614; SSE2-NEXT: movdqa %xmm5, %xmm6 2615; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 2616; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 2617; SSE2-NEXT: pand %xmm6, %xmm2 2618; SSE2-NEXT: pxor %xmm4, %xmm6 2619; SSE2-NEXT: por %xmm2, %xmm6 2620; SSE2-NEXT: pslld $16, %xmm6 2621; SSE2-NEXT: psrad $16, %xmm6 2622; SSE2-NEXT: pxor %xmm1, %xmm3 2623; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 2624; SSE2-NEXT: pxor %xmm5, %xmm4 2625; SSE2-NEXT: pand %xmm1, %xmm5 2626; SSE2-NEXT: por %xmm4, %xmm5 2627; SSE2-NEXT: pslld $16, %xmm5 2628; SSE2-NEXT: psrad $16, %xmm5 2629; SSE2-NEXT: packssdw %xmm6, %xmm5 2630; SSE2-NEXT: psubusw %xmm5, %xmm0 2631; SSE2-NEXT: retq 2632; 2633; SSSE3-LABEL: test32: 2634; SSSE3: # %bb.0: 2635; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 2636; SSSE3-NEXT: movdqa %xmm2, %xmm4 2637; SSSE3-NEXT: pxor %xmm3, %xmm4 2638; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] 2639; SSSE3-NEXT: movdqa %xmm5, %xmm6 2640; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 2641; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 2642; SSSE3-NEXT: pand %xmm6, %xmm2 2643; SSSE3-NEXT: pxor %xmm4, %xmm6 2644; SSSE3-NEXT: por %xmm2, %xmm6 2645; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2646; SSSE3-NEXT: pshufb %xmm2, %xmm6 
2647; SSSE3-NEXT: pxor %xmm1, %xmm3 2648; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 2649; SSSE3-NEXT: pxor %xmm5, %xmm4 2650; SSSE3-NEXT: pand %xmm1, %xmm5 2651; SSSE3-NEXT: por %xmm4, %xmm5 2652; SSSE3-NEXT: pshufb %xmm2, %xmm5 2653; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 2654; SSSE3-NEXT: psubusw %xmm5, %xmm0 2655; SSSE3-NEXT: retq 2656; 2657; SSE41-LABEL: test32: 2658; SSE41: # %bb.0: 2659; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 2660; SSE41-NEXT: pminud %xmm3, %xmm2 2661; SSE41-NEXT: pminud %xmm3, %xmm1 2662; SSE41-NEXT: packusdw %xmm2, %xmm1 2663; SSE41-NEXT: psubusw %xmm1, %xmm0 2664; SSE41-NEXT: retq 2665; 2666; AVX1-LABEL: test32: 2667; AVX1: # %bb.0: 2668; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2669; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] 2670; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 2671; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 2672; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2673; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2674; AVX1-NEXT: vzeroupper 2675; AVX1-NEXT: retq 2676; 2677; AVX2-LABEL: test32: 2678; AVX2: # %bb.0: 2679; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 2680; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 2681; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2682; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2683; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2684; AVX2-NEXT: vzeroupper 2685; AVX2-NEXT: retq 2686; 2687; AVX512-LABEL: test32: 2688; AVX512: # %bb.0: 2689; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 2690; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2691; AVX512-NEXT: vzeroupper 2692; AVX512-NEXT: retq 2693 %zext = zext <8 x i16> %a0 to <8 x i32> 2694 %icmp = icmp ult <8 x i32> %zext, %a1 2695 %umin = select <8 x i1> %icmp, <8 x i32> %zext, <8 x i32> %a1 2696 %trunc = trunc <8 x i32> %umin to <8 x i16> 2697 %sub = sub <8 x i16> %a0, %trunc 2698 ret <8 x i16> %sub 2699} 2700 2701; v8i32/v8i64 - sub(x,trunc(umin(y,zext(x)))) 2702define <8 x i32> 
@test33(<8 x i32> %a0, <8 x i64> %a1) { 2703; SSE2OR3-LABEL: test33: 2704; SSE2OR3: # %bb.0: 2705; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] 2706; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 2707; SSE2OR3-NEXT: pxor %xmm6, %xmm8 2708; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] 2709; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] 2710; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 2711; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 2712; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] 2713; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 2714; SSE2OR3-NEXT: pand %xmm10, %xmm9 2715; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 2716; SSE2OR3-NEXT: pand %xmm9, %xmm3 2717; SSE2OR3-NEXT: pxor %xmm8, %xmm9 2718; SSE2OR3-NEXT: por %xmm3, %xmm9 2719; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 2720; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2721; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] 2722; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 2723; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11 2724; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2725; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 2726; SSE2OR3-NEXT: pand %xmm11, %xmm3 2727; SSE2OR3-NEXT: pand %xmm3, %xmm2 2728; SSE2OR3-NEXT: pxor %xmm8, %xmm3 2729; SSE2OR3-NEXT: por %xmm2, %xmm3 2730; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] 2731; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 2732; SSE2OR3-NEXT: psubd %xmm3, %xmm2 2733; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2734; SSE2OR3-NEXT: pxor %xmm6, %xmm0 2735; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm0 2736; SSE2OR3-NEXT: pand %xmm2, %xmm0 2737; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 2738; SSE2OR3-NEXT: pxor %xmm6, %xmm2 2739; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] 2740; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 2741; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 2742; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2743; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 2744; SSE2OR3-NEXT: pand %xmm9, %xmm2 2745; SSE2OR3-NEXT: pand %xmm2, %xmm5 2746; SSE2OR3-NEXT: pxor %xmm8, %xmm2 2747; SSE2OR3-NEXT: por 
%xmm5, %xmm2 2748; SSE2OR3-NEXT: movdqa %xmm4, %xmm3 2749; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2750; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] 2751; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 2752; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2753; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 2754; SSE2OR3-NEXT: pand %xmm7, %xmm3 2755; SSE2OR3-NEXT: pxor %xmm3, %xmm8 2756; SSE2OR3-NEXT: pand %xmm4, %xmm3 2757; SSE2OR3-NEXT: por %xmm8, %xmm3 2758; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] 2759; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 2760; SSE2OR3-NEXT: psubd %xmm3, %xmm2 2761; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2762; SSE2OR3-NEXT: pxor %xmm6, %xmm1 2763; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm1 2764; SSE2OR3-NEXT: pand %xmm2, %xmm1 2765; SSE2OR3-NEXT: retq 2766; 2767; SSE41-LABEL: test33: 2768; SSE41: # %bb.0: 2769; SSE41-NEXT: movdqa %xmm0, %xmm7 2770; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] 2771; SSE41-NEXT: movdqa %xmm3, %xmm9 2772; SSE41-NEXT: pxor %xmm10, %xmm9 2773; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] 2774; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] 2775; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 2776; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] 2777; SSE41-NEXT: movdqa %xmm6, %xmm0 2778; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 2779; SSE41-NEXT: pand %xmm9, %xmm0 2780; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] 2781; SSE41-NEXT: movapd %xmm9, %xmm11 2782; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 2783; SSE41-NEXT: movdqa %xmm2, %xmm3 2784; SSE41-NEXT: pxor %xmm10, %xmm3 2785; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] 2786; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 2787; SSE41-NEXT: movdqa %xmm6, %xmm0 2788; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 2789; SSE41-NEXT: pand %xmm3, %xmm0 2790; SSE41-NEXT: movapd %xmm9, %xmm3 2791; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 2792; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] 2793; SSE41-NEXT: pmaxud 
%xmm3, %xmm7 2794; SSE41-NEXT: psubd %xmm3, %xmm7 2795; SSE41-NEXT: movdqa %xmm5, %xmm2 2796; SSE41-NEXT: pxor %xmm10, %xmm2 2797; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] 2798; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 2799; SSE41-NEXT: movdqa %xmm6, %xmm0 2800; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 2801; SSE41-NEXT: pand %xmm2, %xmm0 2802; SSE41-NEXT: movapd %xmm9, %xmm2 2803; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 2804; SSE41-NEXT: pxor %xmm4, %xmm10 2805; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 2806; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] 2807; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 2808; SSE41-NEXT: pand %xmm8, %xmm6 2809; SSE41-NEXT: movdqa %xmm6, %xmm0 2810; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 2811; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] 2812; SSE41-NEXT: pmaxud %xmm9, %xmm1 2813; SSE41-NEXT: psubd %xmm9, %xmm1 2814; SSE41-NEXT: movdqa %xmm7, %xmm0 2815; SSE41-NEXT: retq 2816; 2817; AVX1-LABEL: test33: 2818; AVX1: # %bb.0: 2819; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 2820; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 2821; AVX1-NEXT: # xmm4 = mem[0,0] 2822; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 2823; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] 2824; AVX1-NEXT: # xmm6 = mem[0,0] 2825; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 2826; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] 2827; AVX1-NEXT: # xmm7 = mem[0,0] 2828; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 2829; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 2830; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 2831; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2 2832; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 2833; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2834; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3 2835; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 2836; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2837; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 2838; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 2839; AVX1-NEXT: vblendvpd %xmm5, 
%xmm3, %xmm7, %xmm3 2840; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 2841; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 2842; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1 2843; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] 2844; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 2845; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 2846; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2847; AVX1-NEXT: retq 2848; 2849; AVX2-SLOW-LABEL: test33: 2850; AVX2-SLOW: # %bb.0: 2851; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 2852; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4 2853; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] 2854; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 2855; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] 2856; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 2857; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 2858; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 2859; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 2860; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] 2861; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2862; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] 2863; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 2864; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2865; AVX2-SLOW-NEXT: retq 2866; 2867; AVX2-FAST-ALL-LABEL: test33: 2868; AVX2-FAST-ALL: # %bb.0: 2869; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 2870; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 2871; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] 2872; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 2873; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = 
[4294967295,4294967295,4294967295,4294967295] 2874; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 2875; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 2876; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 2877; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3 2878; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 2879; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 2880; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2 2881; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2882; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 2883; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2884; AVX2-FAST-ALL-NEXT: retq 2885; 2886; AVX2-FAST-PERLANE-LABEL: test33: 2887; AVX2-FAST-PERLANE: # %bb.0: 2888; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 2889; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 2890; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] 2891; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 2892; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] 2893; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 2894; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 2895; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 2896; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 2897; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] 2898; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2899; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] 2900; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 2901; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2902; AVX2-FAST-PERLANE-NEXT: retq 2903; 2904; AVX512-LABEL: test33: 2905; AVX512: # %bb.0: 2906; AVX512-NEXT: vpmovusqd %zmm1, %ymm1 2907; AVX512-NEXT: vpmaxud 
%ymm1, %ymm0, %ymm0 2908; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2909; AVX512-NEXT: retq 2910 %zext = zext <8 x i32> %a0 to <8 x i64> 2911 %icmp = icmp ult <8 x i64> %a1, %zext 2912 %umin = select <8 x i1> %icmp, <8 x i64> %a1, <8 x i64> %zext 2913 %trunc = trunc <8 x i64> %umin to <8 x i32> 2914 %sub = sub <8 x i32> %a0, %trunc 2915 ret <8 x i32> %sub 2916} 2917 2918; v8i32/v8i64 - sub(and(x,1),trunc(umin(zext(and(x,1)),y))) 2919define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { 2920; SSE2OR3-LABEL: test34: 2921; SSE2OR3: # %bb.0: 2922; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] 2923; SSE2OR3-NEXT: pand %xmm6, %xmm1 2924; SSE2OR3-NEXT: pand %xmm6, %xmm0 2925; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] 2926; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 2927; SSE2OR3-NEXT: pxor %xmm6, %xmm8 2928; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] 2929; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] 2930; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 2931; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 2932; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] 2933; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 2934; SSE2OR3-NEXT: pand %xmm10, %xmm9 2935; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 2936; SSE2OR3-NEXT: pand %xmm9, %xmm3 2937; SSE2OR3-NEXT: pxor %xmm8, %xmm9 2938; SSE2OR3-NEXT: por %xmm3, %xmm9 2939; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 2940; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2941; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] 2942; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 2943; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11 2944; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2945; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 2946; SSE2OR3-NEXT: pand %xmm11, %xmm3 2947; SSE2OR3-NEXT: pand %xmm3, %xmm2 2948; SSE2OR3-NEXT: pxor %xmm8, %xmm3 2949; SSE2OR3-NEXT: por %xmm2, %xmm3 2950; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] 2951; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 2952; SSE2OR3-NEXT: psubd %xmm3, %xmm2 2953; SSE2OR3-NEXT: pxor %xmm6, %xmm3 
2954; SSE2OR3-NEXT: por %xmm6, %xmm0 2955; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm0 2956; SSE2OR3-NEXT: pand %xmm2, %xmm0 2957; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 2958; SSE2OR3-NEXT: pxor %xmm6, %xmm2 2959; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] 2960; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 2961; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 2962; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2963; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 2964; SSE2OR3-NEXT: pand %xmm9, %xmm2 2965; SSE2OR3-NEXT: pand %xmm2, %xmm5 2966; SSE2OR3-NEXT: pxor %xmm8, %xmm2 2967; SSE2OR3-NEXT: por %xmm5, %xmm2 2968; SSE2OR3-NEXT: movdqa %xmm4, %xmm3 2969; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2970; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] 2971; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 2972; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2973; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 2974; SSE2OR3-NEXT: pand %xmm7, %xmm3 2975; SSE2OR3-NEXT: pxor %xmm3, %xmm8 2976; SSE2OR3-NEXT: pand %xmm4, %xmm3 2977; SSE2OR3-NEXT: por %xmm8, %xmm3 2978; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] 2979; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 2980; SSE2OR3-NEXT: psubd %xmm3, %xmm2 2981; SSE2OR3-NEXT: pxor %xmm6, %xmm3 2982; SSE2OR3-NEXT: por %xmm6, %xmm1 2983; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm1 2984; SSE2OR3-NEXT: pand %xmm2, %xmm1 2985; SSE2OR3-NEXT: retq 2986; 2987; SSE41-LABEL: test34: 2988; SSE41: # %bb.0: 2989; SSE41-NEXT: movdqa %xmm0, %xmm6 2990; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] 2991; SSE41-NEXT: pand %xmm0, %xmm1 2992; SSE41-NEXT: pand %xmm0, %xmm6 2993; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] 2994; SSE41-NEXT: movdqa %xmm3, %xmm9 2995; SSE41-NEXT: pxor %xmm10, %xmm9 2996; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] 2997; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] 2998; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 2999; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] 3000; SSE41-NEXT: movdqa 
%xmm7, %xmm0 3001; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 3002; SSE41-NEXT: pand %xmm9, %xmm0 3003; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] 3004; SSE41-NEXT: movapd %xmm9, %xmm11 3005; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 3006; SSE41-NEXT: movdqa %xmm2, %xmm3 3007; SSE41-NEXT: pxor %xmm10, %xmm3 3008; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] 3009; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 3010; SSE41-NEXT: movdqa %xmm7, %xmm0 3011; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 3012; SSE41-NEXT: pand %xmm3, %xmm0 3013; SSE41-NEXT: movapd %xmm9, %xmm3 3014; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 3015; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] 3016; SSE41-NEXT: pmaxud %xmm3, %xmm6 3017; SSE41-NEXT: psubd %xmm3, %xmm6 3018; SSE41-NEXT: movdqa %xmm5, %xmm2 3019; SSE41-NEXT: pxor %xmm10, %xmm2 3020; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] 3021; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 3022; SSE41-NEXT: movdqa %xmm7, %xmm0 3023; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 3024; SSE41-NEXT: pand %xmm2, %xmm0 3025; SSE41-NEXT: movapd %xmm9, %xmm2 3026; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 3027; SSE41-NEXT: pxor %xmm4, %xmm10 3028; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 3029; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] 3030; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 3031; SSE41-NEXT: pand %xmm8, %xmm7 3032; SSE41-NEXT: movdqa %xmm7, %xmm0 3033; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 3034; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] 3035; SSE41-NEXT: pmaxud %xmm9, %xmm1 3036; SSE41-NEXT: psubd %xmm9, %xmm1 3037; SSE41-NEXT: movdqa %xmm6, %xmm0 3038; SSE41-NEXT: retq 3039; 3040; AVX1-LABEL: test34: 3041; AVX1: # %bb.0: 3042; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3043; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 3044; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 3045; AVX1-NEXT: # xmm4 = mem[0,0] 3046; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 3047; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = 
[9223372041149743103,9223372041149743103] 3048; AVX1-NEXT: # xmm6 = mem[0,0] 3049; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 3050; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] 3051; AVX1-NEXT: # xmm7 = mem[0,0] 3052; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 3053; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 3054; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 3055; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2 3056; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 3057; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3058; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3 3059; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 3060; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3061; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 3062; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 3063; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 3064; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 3065; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 3066; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1 3067; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] 3068; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 3069; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 3070; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3071; AVX1-NEXT: retq 3072; 3073; AVX2-SLOW-LABEL: test34: 3074; AVX2-SLOW: # %bb.0: 3075; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 3076; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0 3077; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 3078; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4 3079; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] 3080; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 3081; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] 3082; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 3083; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 3084; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 3085; AVX2-SLOW-NEXT: 
vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 3086; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] 3087; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3088; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] 3089; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 3090; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 3091; AVX2-SLOW-NEXT: retq 3092; 3093; AVX2-FAST-ALL-LABEL: test34: 3094; AVX2-FAST-ALL: # %bb.0: 3095; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 3096; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0 3097; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 3098; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 3099; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] 3100; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 3101; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] 3102; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 3103; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 3104; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 3105; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3 3106; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 3107; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 3108; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2 3109; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3110; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 3111; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 3112; AVX2-FAST-ALL-NEXT: retq 3113; 3114; AVX2-FAST-PERLANE-LABEL: test34: 3115; AVX2-FAST-PERLANE: # %bb.0: 3116; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 3117; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0 3118; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = 
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 3119; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 3120; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] 3121; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 3122; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] 3123; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 3124; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 3125; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 3126; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 3127; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] 3128; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3129; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] 3130; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 3131; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 3132; AVX2-FAST-PERLANE-NEXT: retq 3133; 3134; AVX512-LABEL: test34: 3135; AVX512: # %bb.0: 3136; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 3137; AVX512-NEXT: vpmovusqd %zmm1, %ymm1 3138; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 3139; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 3140; AVX512-NEXT: retq 3141 %mask = and <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3142 %zext = zext <8 x i32> %mask to <8 x i64> 3143 %icmp = icmp ult <8 x i64> %zext, %a1 3144 %umin = select <8 x i1> %icmp, <8 x i64> %zext, <8 x i64> %a1 3145 %trunc = trunc <8 x i64> %umin to <8 x i32> 3146 %sub = sub <8 x i32> %mask, %trunc 3147 ret <8 x i32> %sub 3148} 3149