; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512

declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128>, <2 x i128>)

define <1 x i32> @usubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: usubo_v1i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: movl %edi, (%rdx)
; CHECK-NEXT: retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, ptr %p2
  ret <1 x i32> %res
}

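; usubo overflows iff the (wrapped) difference is unsigned-greater than the
; LHS. Pre-SSE4.1 there is no unsigned dword compare, so both sides are biased
; by 0x80000000 and compared with signed pcmpgtd; SSE4.1 uses pminud+pcmpeqd
; instead, and AVX512 compares straight into a mask register with vpcmpnleud.
; The narrow <2 x i32> case below is widened to a full XMM operation and only
; the low 8 bytes of the difference are stored.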
define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movq %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm1, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, ptr %p2
  ret <2 x i32> %res
}

define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v3i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSSE3-NEXT: movd %xmm0, 8(%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v3i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pextrd $2, %xmm2, 8(%rdi)
; SSE41-NEXT: movq %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v3i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX-NEXT: vmovq %xmm1, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, ptr %p2
  ret <3 x i32> %res
}

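; The legal <4 x i32> case uses the same compare idioms as above, but the
; whole difference vector is stored with a single full-width store.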
define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, ptr %p2
  ret <4 x i32> %res
}

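; <6 x i32> is not a legal argument/return type: the operands arrive split
; across GPRs and the stack, the vectors are reassembled with scalar inserts,
; and the sign-extended overflow vector is returned through the hidden sret
; pointer (%rdi, echoed in %rax).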
define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd %edx, %xmm2
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movd %r9d, %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm4, (%rcx)
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: movq %xmm0, 16(%rcx)
; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, 16(%rdi)
; SSE2-NEXT: movdqa %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd %edx, %xmm2
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT: movd %r9d, %xmm1
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: psubd %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm4, (%rcx)
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: pxor %xmm3, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: psubd %xmm2, %xmm0
; SSSE3-NEXT: movq %xmm0, 16(%rcx)
; SSSE3-NEXT: pxor %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT: movq %xmm0, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm4, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v6i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: movd %esi, %xmm0
; SSE41-NEXT: pinsrd $1, %edx, %xmm0
; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
; SSE41-NEXT: pinsrd $3, %r8d, %xmm0
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT: movd %r9d, %xmm2
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubd %xmm3, %xmm4
; SSE41-NEXT: pminud %xmm4, %xmm0
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: psubd %xmm1, %xmm5
; SSE41-NEXT: pminud %xmm5, %xmm2
; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: movq %xmm5, 16(%rcx)
; SSE41-NEXT: movdqa %xmm4, (%rcx)
; SSE41-NEXT: movq %xmm2, 16(%rdi)
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v6i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, ptr %p2
  ret <6 x i32> %res
}

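; 256-bit case: SSE splits the operation into two 128-bit halves, as does
; AVX1 (its integer ops are only 128 bits wide); AVX2 and AVX512 subtract and
; compare a full YMM register.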
define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: psubd %xmm3, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: psubd %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: pxor %xmm4, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: psubd %xmm3, %xmm1
; SSSE3-NEXT: pxor %xmm1, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubd %xmm2, %xmm4
; SSE41-NEXT: pminud %xmm4, %xmm0
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psubd %xmm3, %xmm5
; SSE41-NEXT: pminud %xmm5, %xmm1
; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm5, 16(%rdi)
; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, ptr %p2
  ret <8 x i32> %res
}

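; 512-bit case: four XMM pieces on SSE, two YMM pieces on AVX (with the <16 x
; i1> mask repacked through packssdw/packsswb); AVX512 handles the whole
; vector with one vpsubd/vpcmpnleud pair and materializes the sign-extended
; mask with vpternlogd.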
define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: psubd %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: pxor %xmm8, %xmm0
; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: psubd %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: psubd %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: psubd %xmm7, %xmm3
; SSE2-NEXT: pxor %xmm3, %xmm8
; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: movdqa %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v16i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm9
; SSSE3-NEXT: pxor %xmm8, %xmm9
; SSSE3-NEXT: psubd %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: pxor %xmm8, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: psubd %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: psubd %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: psubd %xmm7, %xmm3
; SSSE3-NEXT: pxor %xmm3, %xmm8
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm8, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: psubd %xmm4, %xmm8
; SSE41-NEXT: pminud %xmm8, %xmm0
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm9
; SSE41-NEXT: psubd %xmm5, %xmm9
; SSE41-NEXT: pminud %xmm9, %xmm1
; SSE41-NEXT: pcmpeqd %xmm9, %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: psubd %xmm6, %xmm5
; SSE41-NEXT: pminud %xmm5, %xmm2
; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm6
; SSE41-NEXT: psubd %xmm7, %xmm6
; SSE41-NEXT: pminud %xmm6, %xmm3
; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm6, 48(%rdi)
; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
; SSE41-NEXT: movdqa %xmm9, 16(%rdi)
; SSE41-NEXT: movdqa %xmm8, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpminud %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpminud %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, ptr %p2
  ret <16 x i32> %res
}

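; i8 elements: the unsigned compare itself is cheap (pminub/pcmpeqb exist
; since SSE2), but the <16 x i1> overflow mask must be sign-extended to four
; i32 vectors, via punpck chains on SSE2 or pmovsx on SSE4.1 and later.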
define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
; SSE2-NEXT: pminub %xmm4, %xmm0
; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: movdqa %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: psubb %xmm1, %xmm4
; SSSE3-NEXT: pminub %xmm4, %xmm0
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: movdqa %xmm4, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubb %xmm1, %xmm4
; SSE41-NEXT: pminub %xmm4, %xmm0
; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm3
; SSE41-NEXT: pmovsxbd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpminub %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpminub %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleub %xmm0, %xmm1, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, ptr %p2
  ret <16 x i32> %res
}

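; i16 elements: biased pcmpgtw on SSE2, pminuw+pcmpeqw on SSE4.1; the
; <8 x i1> mask is widened to <8 x i32> for the return value.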
define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: psubw %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubw %xmm1, %xmm2
; SSE41-NEXT: pminuw %xmm2, %xmm0
; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleuw %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, ptr %p2
  ret <8 x i32> %res
}

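; i64 elements: SSE has neither an unsigned 64-bit compare nor pcmpgtq, so
; the biased 64-bit compare is assembled from 32-bit pcmpgtd/pcmpeqd pieces;
; AVX uses vpcmpgtq on biased values and AVX512 compares with vpcmpnleuq.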
define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm3
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
; SSE-NEXT: por %xmm4, %xmm1
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: usubo_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, ptr %p2
  ret <2 x i32> %res
}

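; Non-power-of-two i24 elements are zero-extended into i32 lanes by masking
; to 24 bits; overflow is detected by re-masking the difference and comparing,
; and each 3-byte result element is stored as a 16-bit word plus a byte.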
define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v4i24:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT: movd %xmm1, %esi
; SSE2-NEXT: movw %si, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 2(%rdi)
; SSE2-NEXT: shrl $16, %ecx
; SSE2-NEXT: movb %cl, 11(%rdi)
; SSE2-NEXT: shrl $16, %edx
; SSE2-NEXT: movb %dl, 8(%rdi)
; SSE2-NEXT: shrl $16, %esi
; SSE2-NEXT: movb %sil, 5(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v4i24:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: psubd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm0
; SSSE3-NEXT: movd %xmm2, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT: movd %xmm1, %esi
; SSSE3-NEXT: movw %si, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 2(%rdi)
; SSSE3-NEXT: shrl $16, %ecx
; SSSE3-NEXT: movb %cl, 11(%rdi)
; SSSE3-NEXT: shrl $16, %edx
; SSSE3-NEXT: movb %dl, 8(%rdi)
; SSSE3-NEXT: shrl $16, %esi
; SSSE3-NEXT: movb %sil, 5(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v4i24:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pextrd $3, %xmm0, %eax
; SSE41-NEXT: movw %ax, 9(%rdi)
; SSE41-NEXT: pextrd $2, %xmm0, %ecx
; SSE41-NEXT: movw %cx, 6(%rdi)
; SSE41-NEXT: pextrd $1, %xmm0, %edx
; SSE41-NEXT: movw %dx, 3(%rdi)
; SSE41-NEXT: movd %xmm0, %esi
; SSE41-NEXT: movw %si, (%rdi)
; SSE41-NEXT: shrl $16, %eax
; SSE41-NEXT: movb %al, 11(%rdi)
; SSE41-NEXT: shrl $16, %ecx
; SSE41-NEXT: movb %cl, 8(%rdi)
; SSE41-NEXT: shrl $16, %edx
; SSE41-NEXT: movb %dl, 5(%rdi)
; SSE41-NEXT: shrl $16, %esi
; SSE41-NEXT: movb %sil, 2(%rdi)
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v4i24:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
; AVX1-NEXT: movw %ax, 9(%rdi)
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: movw %cx, 6(%rdi)
; AVX1-NEXT: vpextrd $1, %xmm1, %edx
; AVX1-NEXT: movw %dx, 3(%rdi)
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: movw %si, (%rdi)
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: movb %al, 11(%rdi)
; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: movb %cl, 8(%rdi)
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movb %dl, 5(%rdi)
; AVX1-NEXT: shrl $16, %esi
; AVX1-NEXT: movb %sil, 2(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v4i24:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpextrd $3, %xmm1, %eax
; AVX2-NEXT: movw %ax, 9(%rdi)
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: movw %cx, 6(%rdi)
; AVX2-NEXT: vpextrd $1, %xmm1, %edx
; AVX2-NEXT: movw %dx, 3(%rdi)
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: movw %si, (%rdi)
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: movb %al, 11(%rdi)
; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: movb %cl, 8(%rdi)
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movb %dl, 5(%rdi)
; AVX2-NEXT: shrl $16, %esi
; AVX2-NEXT: movb %sil, 2(%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v4i24:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vpextrd $3, %xmm1, %eax
; AVX512-NEXT: movw %ax, 9(%rdi)
; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
; AVX512-NEXT: movw %cx, 6(%rdi)
; AVX512-NEXT: vpextrd $1, %xmm1, %edx
; AVX512-NEXT: movw %dx, 3(%rdi)
; AVX512-NEXT: vmovd %xmm1, %esi
; AVX512-NEXT: movw %si, (%rdi)
; AVX512-NEXT: shrl $16, %eax
; AVX512-NEXT: movb %al, 11(%rdi)
; AVX512-NEXT: shrl $16, %ecx
; AVX512-NEXT: movb %cl, 8(%rdi)
; AVX512-NEXT: shrl $16, %edx
; AVX512-NEXT: movb %dl, 5(%rdi)
; AVX512-NEXT: shrl $16, %esi
; AVX512-NEXT: movb %sil, 2(%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, ptr %p2
  ret <4 x i32> %res
}

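; i1 elements: the 1-bit difference reduces to a ^ b and the borrow to
; ~a & b, so no subtract is emitted; the value vector is stored as a 4-bit
; mask via movmskps, or via vptestmd into a k-register on AVX512.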
define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: pslld $31, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: usubo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
; AVX-NEXT: vmovmskps %xmm2, %eax
; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, ptr %p2
  ret <4 x i32> %res
}

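; i128 elements are expanded to scalar sub/sbb chains; the borrows are
; captured with sbbl into GPRs (SSE/AVX) or with setb into a mask register
; (AVX512).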
define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v2i128:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: xorl %r10d, %r10d
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: movl $0, %r11d
; SSE2-NEXT: sbbl %r11d, %r11d
; SSE2-NEXT: subq %r8, %rdi
; SSE2-NEXT: sbbq %r9, %rsi
; SSE2-NEXT: movd %r11d, %xmm1
; SSE2-NEXT: sbbl %r10d, %r10d
; SSE2-NEXT: movd %r10d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movq %rdx, 16(%rax)
; SSE2-NEXT: movq %rdi, (%rax)
; SSE2-NEXT: movq %rcx, 24(%rax)
; SSE2-NEXT: movq %rsi, 8(%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i128:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSSE3-NEXT: xorl %r10d, %r10d
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: movl $0, %r11d
; SSSE3-NEXT: sbbl %r11d, %r11d
; SSSE3-NEXT: subq %r8, %rdi
; SSSE3-NEXT: sbbq %r9, %rsi
; SSSE3-NEXT: movd %r11d, %xmm1
; SSSE3-NEXT: sbbl %r10d, %r10d
; SSSE3-NEXT: movd %r10d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %rdx, 16(%rax)
; SSSE3-NEXT: movq %rdi, (%rax)
; SSSE3-NEXT: movq %rcx, 24(%rax)
; SSSE3-NEXT: movq %rsi, 8(%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i128:
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE41-NEXT: xorl %r10d, %r10d
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: movl $0, %r11d
; SSE41-NEXT: sbbl %r11d, %r11d
; SSE41-NEXT: subq %r8, %rdi
; SSE41-NEXT: sbbq %r9, %rsi
; SSE41-NEXT: sbbl %r10d, %r10d
; SSE41-NEXT: movd %r10d, %xmm0
; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
; SSE41-NEXT: movq %rdx, 16(%rax)
; SSE41-NEXT: movq %rdi, (%rax)
; SSE41-NEXT: movq %rcx, 24(%rax)
; SSE41-NEXT: movq %rsi, 8(%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v2i128:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: xorl %r10d, %r10d
; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: movl $0, %r11d
; AVX-NEXT: sbbl %r11d, %r11d
; AVX-NEXT: subq %r8, %rdi
; AVX-NEXT: sbbq %r9, %rsi
; AVX-NEXT: sbbl %r10d, %r10d
; AVX-NEXT: vmovd %r10d, %xmm0
; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
; AVX-NEXT: movq %rdx, 16(%rax)
; AVX-NEXT: movq %rdi, (%rax)
; AVX-NEXT: movq %rcx, 24(%rax)
; AVX-NEXT: movq %rsi, 8(%rax)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v2i128:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: setb %r10b
; AVX512-NEXT: kmovd %r10d, %k0
; AVX512-NEXT: subq %r8, %rdi
; AVX512-NEXT: sbbq %r9, %rsi
; AVX512-NEXT: setb %r8b
; AVX512-NEXT: andl $1, %r8d
; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%rax)
; AVX512-NEXT: movq %rdi, (%rax)
; AVX512-NEXT: movq %rcx, 24(%rax)
; AVX512-NEXT: movq %rsi, 8(%rax)
; AVX512-NEXT: retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, ptr %p2
  ret <2 x i32> %res
}