; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW

declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>)

define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: umulo_v1i32:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: mull %esi
; CHECK-NEXT: seto %dil
; CHECK-NEXT: negl %edi
; CHECK-NEXT: movl %eax, (%rcx)
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, ptr %p2
  ret <1 x i32> %res
}

define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: movq %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSSE3-NEXT: pmuludq %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSSE3-NEXT: movq %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE41-NEXT: movq %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: umulo_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX-NEXT: vmovq %xmm1, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: umulo_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, ptr %p2
  ret <2 x i32> %res
}

define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: movd %xmm2, 8(%rdi)
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v3i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: movd %xmm2, 8(%rdi)
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v3i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v3i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vmovdqa %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v3i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vmovdqa %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, ptr %p2
  ret <3 x i32> %res
}

define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vmovdqa %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: vmovdqa %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, ptr %p2
  ret <4 x i32> %res
}

define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd %edx, %xmm0
; SSE2-NEXT: movd %esi, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT: movd %r9d, %xmm1
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSE2-NEXT: pmuludq %xmm2, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
; SSE2-NEXT: pxor %xmm5, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT: movq %xmm0, 16(%rcx)
; SSE2-NEXT: movdqa %xmm3, (%rcx)
; SSE2-NEXT: movq %xmm7, 16(%rdi)
; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd %edx, %xmm0
; SSSE3-NEXT: movd %esi, %xmm3
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT: movd %r9d, %xmm1
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pmuludq %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSSE3-NEXT: pmuludq %xmm2, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7
; SSSE3-NEXT: pxor %xmm5, %xmm7
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSSE3-NEXT: movq %xmm0, 16(%rcx)
; SSSE3-NEXT: movdqa %xmm3, (%rcx)
; SSSE3-NEXT: movq %xmm7, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v6i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edi
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: movd %r9d, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pmuludq %xmm2, %xmm1
; SSE41-NEXT: pinsrd $1, %edi, %xmm2
; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %r9d
; SSE41-NEXT: pinsrd $1, %r9d, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: movd %esi, %xmm2
; SSE41-NEXT: pinsrd $1, %edx, %xmm2
; SSE41-NEXT: pinsrd $2, %ecx, %xmm2
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: pinsrd $3, %r8d, %xmm2
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm4
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
; SSE41-NEXT: pxor %xmm6, %xmm3
; SSE41-NEXT: movd %edi, %xmm7
; SSE41-NEXT: movd %r9d, %xmm8
; SSE41-NEXT: pmuludq %xmm7, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
; SSE41-NEXT: pxor %xmm6, %xmm1
; SSE41-NEXT: pmulld %xmm2, %xmm4
; SSE41-NEXT: movq %xmm0, 16(%rcx)
; SSE41-NEXT: movdqa %xmm4, (%rcx)
; SSE41-NEXT: movq %xmm1, 16(%rax)
; SSE41-NEXT: movdqa %xmm3, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT: vmovq %xmm1, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v6i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, ptr %p2
  ret <6 x i32> %res
}

define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
; SSE2-NEXT: pxor %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm7, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: movdqa %xmm4, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
; SSSE3-NEXT: pxor %xmm7, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
; SSSE3-NEXT: pxor %xmm7, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm4, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm4, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
; SSE41-NEXT: pxor %xmm7, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm5, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmuludq %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5],xmm8[6,7]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
; SSE41-NEXT: pxor %xmm7, %xmm5
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, ptr %p2
  ret <8 x i32> %res
}

define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
; SSE2-NEXT: pcmpeqd %xmm12, %xmm12
; SSE2-NEXT: pxor %xmm12, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
; SSE2-NEXT: pxor %xmm12, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm14, %xmm13
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm8
; SSE2-NEXT: pxor %xmm12, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm14, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
; SSE2-NEXT: pxor %xmm12, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: movdqa %xmm9, (%rdi)
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm8, %xmm2
; SSE2-NEXT: movdqa %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v16i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm4, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm8, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: pxor %xmm11, %xmm11
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm12, %xmm12
; SSSE3-NEXT: pxor %xmm12, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm8, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
; SSSE3-NEXT: pxor %xmm12, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm6, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm14, %xmm13
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8
; SSSE3-NEXT: pxor %xmm12, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm7, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm14, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
; SSSE3-NEXT: pxor %xmm12, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm9, (%rdi)
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm6, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm8, %xmm9
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: pmuludq %xmm4, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
; SSE41-NEXT: pxor %xmm12, %xmm12
; SSE41-NEXT: pcmpeqd %xmm12, %xmm8
; SSE41-NEXT: pcmpeqd %xmm13, %xmm13
; SSE41-NEXT: pxor %xmm13, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm9, %xmm10
; SSE41-NEXT: movdqa %xmm1, %xmm9
; SSE41-NEXT: pmuludq %xmm5, %xmm9
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7]
; SSE41-NEXT: pcmpeqd %xmm12, %xmm9
; SSE41-NEXT: pxor %xmm13, %xmm9
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm10, %xmm11
; SSE41-NEXT: movdqa %xmm2, %xmm10
; SSE41-NEXT: pmuludq %xmm6, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
; SSE41-NEXT: pcmpeqd %xmm12, %xmm10
; SSE41-NEXT: pxor %xmm13, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm11, %xmm14
; SSE41-NEXT: movdqa %xmm3, %xmm11
; SSE41-NEXT: pmuludq %xmm7, %xmm11
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7]
; SSE41-NEXT: pcmpeqd %xmm12, %xmm11
; SSE41-NEXT: pxor %xmm13, %xmm11
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm5, %xmm1
; SSE41-NEXT: pmulld %xmm6, %xmm2
; SSE41-NEXT: pmulld %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: movdqa %xmm9, %xmm1
; SSE41-NEXT: movdqa %xmm10, %xmm2
; SSE41-NEXT: movdqa %xmm11, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm9
; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7]
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpackssdw %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10
; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm9, %xmm11, %xmm9
; AVX1-NEXT: vpmuludq %xmm8, %xmm10, %xmm11
; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5],xmm9[6,7]
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm9
; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm11, %xmm12, %xmm11
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm12
; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5],xmm11[6,7]
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm11, %xmm7
; AVX1-NEXT: vpackssdw %xmm9, %xmm7, %xmm7
; AVX1-NEXT: vpacksswb %xmm6, %xmm7, %xmm7
; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9
; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmulld %xmm8, %xmm10, %xmm8
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpacksswb %xmm6, %xmm6, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm8, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm5
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7]
; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX2-NEXT: vpackssdw %xmm7, %xmm4, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuludq %ymm7, %ymm8, %ymm7
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm8
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm0
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, ptr %p2
  ret <16 x i32> %res
}

define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSE2-NEXT: pmullw %xmm3, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: packuswb %xmm3, %xmm4
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: movdqa %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSSE3-NEXT: pmullw %xmm3, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: pmullw %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm4
; SSSE3-NEXT: packuswb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $8, %xmm5
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: packuswb %xmm5, %xmm0
; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: movdqa %xmm4, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: pmullw %xmm3, %xmm5
; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: packuswb %xmm1, %xmm4
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: psrlw $8, %xmm5
; SSE41-NEXT: packuswb %xmm0, %xmm5
; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm5, %xmm3
; SSE41-NEXT: pmovsxbd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umulo_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: umulo_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm0
; AVX512BW-NEXT: vptestmw %ymm0, %ymm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovwb %ymm1, (%rdi)
; AVX512BW-NEXT: retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, ptr %p2
  ret <16 x i32> %res
}

define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
; SSE2-NEXT: movdqa %xmm1, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
; SSE2-NEXT: pmullw %xmm7, %xmm8
; SSE2-NEXT: movdqa %xmm8, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: packuswb %xmm7, %xmm4
; SSE2-NEXT: psrlw $8, %xmm8
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm8, %xmm1
; SSE2-NEXT: pcmpeqb %xmm5, %xmm1
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm6
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm6, %xmm0
; SSE2-NEXT: pcmpeqb %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm7
; SSE2-NEXT: psrad $31, %xmm7
; SSE2-NEXT: movdqa %xmm1, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm8, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm9
; SSE2-NEXT: psrad $31, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm8
; SSE2-NEXT: psrad $31, %xmm8
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm4, 16(%rsi)
; SSE2-NEXT: movdqa %xmm2, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 64(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm8, 112(%rdi)
; SSE2-NEXT: movdqa %xmm9, 96(%rdi)
; SSE2-NEXT: movdqa %xmm7, 80(%rdi)
; SSE2-NEXT: movdqa %xmm5, 48(%rdi)
; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; SSSE3-NEXT: movdqa %xmm0, %xmm6
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSSE3-NEXT: pmullw %xmm4, %xmm6
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSSE3-NEXT: pmullw %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: packuswb %xmm7, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm7
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
; SSSE3-NEXT: movdqa %xmm1, %xmm8
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
; SSSE3-NEXT: pmullw %xmm7, %xmm8
; SSSE3-NEXT: movdqa %xmm8, %xmm7
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSSE3-NEXT: pmullw %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm4
; SSSE3-NEXT: packuswb %xmm7, %xmm4
; SSSE3-NEXT: psrlw $8, %xmm8
; SSSE3-NEXT: psrlw $8, %xmm1
; SSSE3-NEXT: packuswb %xmm8, %xmm1
; SSSE3-NEXT: pcmpeqb %xmm5, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm3, %xmm1
; SSSE3-NEXT: psrlw $8, %xmm6
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: packuswb %xmm6,
%xmm0 1367; SSSE3-NEXT: pcmpeqb %xmm5, %xmm0 1368; SSSE3-NEXT: pxor %xmm3, %xmm0 1369; SSSE3-NEXT: movdqa %xmm0, %xmm3 1370; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1371; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1372; SSSE3-NEXT: pslld $31, %xmm3 1373; SSSE3-NEXT: psrad $31, %xmm3 1374; SSSE3-NEXT: movdqa %xmm0, %xmm5 1375; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1376; SSSE3-NEXT: movdqa %xmm5, %xmm6 1377; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] 1378; SSSE3-NEXT: pslld $31, %xmm6 1379; SSSE3-NEXT: psrad $31, %xmm6 1380; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1381; SSSE3-NEXT: pslld $31, %xmm5 1382; SSSE3-NEXT: psrad $31, %xmm5 1383; SSSE3-NEXT: movdqa %xmm1, %xmm7 1384; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1385; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 1386; SSSE3-NEXT: pslld $31, %xmm7 1387; SSSE3-NEXT: psrad $31, %xmm7 1388; SSSE3-NEXT: movdqa %xmm1, %xmm8 1389; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1390; SSSE3-NEXT: movdqa %xmm8, %xmm9 1391; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] 1392; SSSE3-NEXT: pslld $31, %xmm9 1393; SSSE3-NEXT: psrad $31, %xmm9 1394; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 1395; SSSE3-NEXT: pslld $31, %xmm8 1396; SSSE3-NEXT: psrad $31, %xmm8 1397; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1398; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1399; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1400; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1401; SSSE3-NEXT: movdqa %xmm4, 16(%rsi) 1402; SSSE3-NEXT: movdqa %xmm2, (%rsi) 1403; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) 1404; SSSE3-NEXT: movdqa %xmm0, (%rdi) 1405; SSSE3-NEXT: movdqa %xmm8, 112(%rdi) 1406; SSSE3-NEXT: movdqa %xmm9, 96(%rdi) 1407; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) 1408; SSSE3-NEXT: movdqa %xmm5, 48(%rdi) 1409; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) 1410; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) 1411; SSSE3-NEXT: retq 1412; 1413; SSE41-LABEL: umulo_v32i8: 1414; SSE41: # %bb.0: 1415; SSE41-NEXT: movq %rdi, %rax 1416; SSE41-NEXT: pxor %xmm7, %xmm7 1417; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1418; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 1419; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1420; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] 1421; SSE41-NEXT: pmullw %xmm2, %xmm0 1422; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1423; SSE41-NEXT: movdqa %xmm0, %xmm6 1424; SSE41-NEXT: pand %xmm2, %xmm6 1425; SSE41-NEXT: pmullw %xmm5, %xmm4 1426; SSE41-NEXT: movdqa %xmm4, %xmm5 1427; SSE41-NEXT: pand %xmm2, %xmm5 1428; SSE41-NEXT: packuswb %xmm6, %xmm5 1429; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1430; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] 1431; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1432; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] 1433; SSE41-NEXT: pmullw %xmm3, %xmm1 1434; SSE41-NEXT: movdqa %xmm1, %xmm3 1435; SSE41-NEXT: pand %xmm2, %xmm3 1436; SSE41-NEXT: pmullw %xmm8, %xmm6 1437; SSE41-NEXT: pand %xmm6, %xmm2 1438; SSE41-NEXT: packuswb %xmm3, %xmm2 1439; SSE41-NEXT: psrlw $8, %xmm1 1440; SSE41-NEXT: psrlw $8, %xmm6 1441; SSE41-NEXT: packuswb %xmm1, %xmm6 1442; SSE41-NEXT: pcmpeqb %xmm7, %xmm6 1443; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1444; SSE41-NEXT: pxor %xmm1, %xmm6 1445; SSE41-NEXT: psrlw $8, %xmm0 1446; SSE41-NEXT: psrlw $8, %xmm4 1447; SSE41-NEXT: packuswb %xmm0, %xmm4 1448; SSE41-NEXT: pcmpeqb %xmm7, %xmm4 1449; SSE41-NEXT: pxor %xmm1, %xmm4 1450; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 1451; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1452; SSE41-NEXT: pslld $31, %xmm0 1453; SSE41-NEXT: psrad $31, %xmm0 1454; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 1455; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1456; SSE41-NEXT: pslld $31, %xmm1 1457; SSE41-NEXT: psrad $31, %xmm1 1458; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] 1459; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1460; SSE41-NEXT: pslld $31, %xmm3 1461; SSE41-NEXT: psrad $31, %xmm3 1462; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] 1463; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 1464; SSE41-NEXT: pslld $31, %xmm7 1465; SSE41-NEXT: psrad $31, %xmm7 1466; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] 1467; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 1468; SSE41-NEXT: pslld $31, %xmm8 1469; SSE41-NEXT: psrad $31, %xmm8 1470; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] 1471; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 1472; SSE41-NEXT: pslld $31, %xmm9 1473; SSE41-NEXT: psrad $31, %xmm9 1474; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 1475; SSE41-NEXT: pmovsxbd %xmm6, %xmm6 1476; SSE41-NEXT: movdqa %xmm2, 16(%rsi) 1477; SSE41-NEXT: movdqa %xmm5, (%rsi) 1478; SSE41-NEXT: movdqa %xmm6, 64(%rdi) 1479; SSE41-NEXT: movdqa %xmm4, (%rdi) 1480; SSE41-NEXT: movdqa %xmm9, 112(%rdi) 1481; SSE41-NEXT: movdqa %xmm8, 96(%rdi) 1482; SSE41-NEXT: movdqa %xmm7, 80(%rdi) 1483; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 1484; SSE41-NEXT: movdqa %xmm1, 32(%rdi) 1485; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 1486; SSE41-NEXT: retq 1487; 1488; AVX1-LABEL: umulo_v32i8: 1489; AVX1: # %bb.0: 1490; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1491; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1492; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1493; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 1494; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 1495; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm4 1496; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1497; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1498; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 1499; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm7 1500; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm4 1501; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1502; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1503; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1504; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1505; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 1506; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 1507; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1508; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1509; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1510; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm1 1511; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm5 1512; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm1 1513; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1514; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1515; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1516; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1517; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm7 1518; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm0 1519; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm3 1520; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 1521; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1522; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 1523; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1524; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1525; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1526; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1527; AVX1-NEXT: vpmovsxbd %xmm7, %xmm2 1528; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] 1529; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1530; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1531; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1532; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1533; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1534; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1535; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 1536; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] 1537; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1538; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] 1539; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 1540; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 1541; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) 1542; AVX1-NEXT: vmovdqa %xmm4, (%rdi) 1543; AVX1-NEXT: retq 1544; 1545; AVX2-LABEL: umulo_v32i8: 1546; AVX2: # %bb.0: 1547; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1548; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 1549; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1550; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 1551; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1552; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5 1553; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 1554; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1555; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1556; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm1 1557; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm4 1558; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm1 1559; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1560; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1561; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 1562; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1563; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 1564; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 1565; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1566; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2 1567; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1568; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1569; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 1570; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 1571; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1572; AVX2-NEXT: retq 1573; 1574; AVX512F-LABEL: umulo_v32i8: 1575; AVX512F: # %bb.0: 1576; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 1577; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 1578; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 1579; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 1580; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 1581; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm3 1582; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 1583; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 1584; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1585; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1586; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm3 1587; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm0 1588; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1589; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 1590; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 1591; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 1592; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1593; AVX512F-NEXT: vpmovdb %zmm2, 16(%rdi) 1594; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 1595; AVX512F-NEXT: vpmovdb %zmm2, (%rdi) 1596; AVX512F-NEXT: retq 1597; 1598; AVX512BW-LABEL: umulo_v32i8: 1599; AVX512BW: # %bb.0: 1600; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1601; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1602; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2 1603; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm0 1604; AVX512BW-NEXT: vptestmw %zmm0, %zmm0, %k1 1605; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 1606; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 1607; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 1608; AVX512BW-NEXT: vpmovwb %zmm2, (%rdi) 1609; AVX512BW-NEXT: retq 1610 %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1) 1611 %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0 1612 %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1 1613 %res = sext <32 x i1> %obit to <32 x i32> 1614 store <32 x i8> %val, ptr %p2 1615 ret <32 x i32> %res 1616} 1617 1618define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { 1619; SSE2-LABEL: umulo_v64i8: 1620; SSE2: # %bb.0: 1621; SSE2-NEXT: movq %rdi, %rax 1622; SSE2-NEXT: pxor 
%xmm10, %xmm10 1623; SSE2-NEXT: movdqa %xmm4, %xmm8 1624; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] 1625; SSE2-NEXT: movdqa %xmm0, %xmm11 1626; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 1627; SSE2-NEXT: pmullw %xmm8, %xmm11 1628; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1629; SSE2-NEXT: movdqa %xmm11, %xmm9 1630; SSE2-NEXT: pand %xmm8, %xmm9 1631; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] 1632; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 1633; SSE2-NEXT: pmullw %xmm4, %xmm0 1634; SSE2-NEXT: movdqa %xmm0, %xmm4 1635; SSE2-NEXT: pand %xmm8, %xmm4 1636; SSE2-NEXT: packuswb %xmm9, %xmm4 1637; SSE2-NEXT: movdqa %xmm5, %xmm9 1638; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 1639; SSE2-NEXT: movdqa %xmm1, %xmm12 1640; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] 1641; SSE2-NEXT: pmullw %xmm9, %xmm12 1642; SSE2-NEXT: movdqa %xmm12, %xmm9 1643; SSE2-NEXT: pand %xmm8, %xmm9 1644; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] 1645; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 1646; SSE2-NEXT: pmullw %xmm5, %xmm1 1647; SSE2-NEXT: movdqa %xmm1, %xmm5 1648; SSE2-NEXT: pand %xmm8, %xmm5 1649; SSE2-NEXT: packuswb %xmm9, %xmm5 1650; SSE2-NEXT: movdqa %xmm6, %xmm9 1651; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 1652; SSE2-NEXT: movdqa %xmm2, %xmm13 1653; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] 1654; SSE2-NEXT: pmullw %xmm9, %xmm13 1655; SSE2-NEXT: movdqa %xmm13, %xmm14 1656; SSE2-NEXT: pand %xmm8, %xmm14 1657; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 1658; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 1659; SSE2-NEXT: pmullw %xmm6, %xmm2 1660; SSE2-NEXT: movdqa %xmm2, %xmm9 1661; SSE2-NEXT: pand %xmm8, %xmm9 1662; SSE2-NEXT: packuswb %xmm14, %xmm9 1663; SSE2-NEXT: movdqa %xmm7, %xmm6 1664; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] 1665; SSE2-NEXT: movdqa %xmm3, %xmm14 1666; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] 1667; SSE2-NEXT: pmullw %xmm6, %xmm14 1668; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] 1669; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 1670; SSE2-NEXT: pmullw %xmm7, %xmm3 1671; SSE2-NEXT: movdqa %xmm14, %xmm6 1672; SSE2-NEXT: pand %xmm8, %xmm6 1673; SSE2-NEXT: pand %xmm3, %xmm8 1674; SSE2-NEXT: packuswb %xmm6, %xmm8 1675; SSE2-NEXT: psrlw $8, %xmm14 1676; SSE2-NEXT: psrlw $8, %xmm3 1677; SSE2-NEXT: packuswb %xmm14, %xmm3 1678; SSE2-NEXT: psrlw $8, %xmm13 1679; SSE2-NEXT: psrlw $8, %xmm2 1680; SSE2-NEXT: packuswb %xmm13, %xmm2 1681; SSE2-NEXT: psrlw $8, %xmm12 1682; SSE2-NEXT: psrlw $8, %xmm1 1683; SSE2-NEXT: packuswb %xmm12, %xmm1 1684; SSE2-NEXT: psrlw $8, %xmm11 1685; SSE2-NEXT: psrlw $8, %xmm0 1686; SSE2-NEXT: packuswb %xmm11, %xmm0 1687; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 1688; SSE2-NEXT: pcmpeqb %xmm10, %xmm2 1689; SSE2-NEXT: pcmpeqb %xmm10, %xmm1 1690; SSE2-NEXT: pcmpeqb %xmm10, %xmm0 1691; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 1692; SSE2-NEXT: pxor %xmm6, %xmm3 1693; SSE2-NEXT: pxor %xmm6, %xmm2 1694; SSE2-NEXT: pxor %xmm6, %xmm1 1695; SSE2-NEXT: pxor %xmm6, %xmm0 1696; SSE2-NEXT: movdqa %xmm0, %xmm6 1697; SSE2-NEXT: movdqa %xmm8, 48(%rsi) 1698; SSE2-NEXT: movdqa %xmm1, %xmm7 1699; SSE2-NEXT: movdqa %xmm9, 32(%rsi) 1700; SSE2-NEXT: movdqa %xmm2, %xmm8 1701; SSE2-NEXT: movdqa %xmm5, 16(%rsi) 1702; SSE2-NEXT: movdqa %xmm3, %xmm5 1703; SSE2-NEXT: movdqa %xmm4, (%rsi) 1704; SSE2-NEXT: movdqa %xmm3, %xmm4 1705; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1706; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1707; SSE2-NEXT: movdqa %xmm3, 192(%rdi) 1708; SSE2-NEXT: movdqa %xmm2, %xmm3 1709; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1710; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1711; SSE2-NEXT: movdqa %xmm2, 128(%rdi) 1712; SSE2-NEXT: movdqa %xmm1, %xmm2 1713; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1714; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1715; SSE2-NEXT: movdqa %xmm1, 64(%rdi) 1716; SSE2-NEXT: movdqa %xmm0, %xmm1 1717; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1718; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1719; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1720; SSE2-NEXT: movdqa %xmm0, (%rdi) 1721; SSE2-NEXT: movdqa %xmm4, %xmm0 1722; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1723; SSE2-NEXT: pslld $31, %xmm4 1724; SSE2-NEXT: psrad $31, %xmm4 1725; SSE2-NEXT: movdqa %xmm4, 224(%rdi) 1726; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1727; SSE2-NEXT: pslld $31, %xmm0 1728; SSE2-NEXT: psrad $31, %xmm0 1729; SSE2-NEXT: movdqa %xmm0, 240(%rdi) 1730; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1731; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1732; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1733; SSE2-NEXT: pslld $31, %xmm5 1734; SSE2-NEXT: psrad $31, %xmm5 1735; SSE2-NEXT: movdqa %xmm5, 208(%rdi) 1736; SSE2-NEXT: movdqa %xmm3, %xmm0 1737; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1738; SSE2-NEXT: pslld $31, %xmm3 1739; SSE2-NEXT: psrad $31, %xmm3 1740; SSE2-NEXT: movdqa %xmm3, 160(%rdi) 1741; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1742; SSE2-NEXT: pslld $31, %xmm0 1743; SSE2-NEXT: psrad $31, %xmm0 1744; SSE2-NEXT: movdqa %xmm0, 176(%rdi) 1745; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1746; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1747; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 1748; SSE2-NEXT: pslld $31, %xmm8 1749; SSE2-NEXT: psrad $31, %xmm8 1750; SSE2-NEXT: movdqa %xmm8, 144(%rdi) 1751; SSE2-NEXT: movdqa %xmm2, %xmm0 1752; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1753; SSE2-NEXT: pslld $31, %xmm2 1754; SSE2-NEXT: psrad $31, %xmm2 1755; SSE2-NEXT: movdqa %xmm2, 96(%rdi) 1756; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1757; SSE2-NEXT: pslld $31, %xmm0 1758; SSE2-NEXT: psrad $31, %xmm0 1759; SSE2-NEXT: movdqa %xmm0, 112(%rdi) 1760; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1761; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1762; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 1763; SSE2-NEXT: pslld $31, %xmm7 1764; SSE2-NEXT: psrad $31, %xmm7 1765; SSE2-NEXT: movdqa %xmm7, 80(%rdi) 1766; SSE2-NEXT: movdqa %xmm1, %xmm0 1767; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1768; SSE2-NEXT: pslld $31, %xmm1 1769; SSE2-NEXT: psrad $31, %xmm1 1770; SSE2-NEXT: movdqa %xmm1, 32(%rdi) 1771; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1772; SSE2-NEXT: pslld $31, %xmm0 1773; SSE2-NEXT: psrad $31, %xmm0 1774; SSE2-NEXT: movdqa %xmm0, 48(%rdi) 1775; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1776; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1777; SSE2-NEXT: pslld $31, %xmm6 1778; SSE2-NEXT: psrad $31, %xmm6 1779; SSE2-NEXT: movdqa %xmm6, 16(%rdi) 1780; SSE2-NEXT: retq 1781; 1782; SSSE3-LABEL: umulo_v64i8: 1783; SSSE3: # %bb.0: 1784; SSSE3-NEXT: movq %rdi, %rax 1785; SSSE3-NEXT: pxor %xmm10, %xmm10 1786; SSSE3-NEXT: movdqa %xmm4, %xmm8 1787; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] 1788; SSSE3-NEXT: movdqa %xmm0, %xmm11 1789; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 1790; SSSE3-NEXT: pmullw %xmm8, %xmm11 1791; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1792; SSSE3-NEXT: movdqa %xmm11, %xmm9 1793; SSSE3-NEXT: pand %xmm8, %xmm9 1794; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] 1795; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 1796; SSSE3-NEXT: pmullw %xmm4, %xmm0 1797; SSSE3-NEXT: movdqa 
%xmm0, %xmm4 1798; SSSE3-NEXT: pand %xmm8, %xmm4 1799; SSSE3-NEXT: packuswb %xmm9, %xmm4 1800; SSSE3-NEXT: movdqa %xmm5, %xmm9 1801; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 1802; SSSE3-NEXT: movdqa %xmm1, %xmm12 1803; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] 1804; SSSE3-NEXT: pmullw %xmm9, %xmm12 1805; SSSE3-NEXT: movdqa %xmm12, %xmm9 1806; SSSE3-NEXT: pand %xmm8, %xmm9 1807; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] 1808; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 1809; SSSE3-NEXT: pmullw %xmm5, %xmm1 1810; SSSE3-NEXT: movdqa %xmm1, %xmm5 1811; SSSE3-NEXT: pand %xmm8, %xmm5 1812; SSSE3-NEXT: packuswb %xmm9, %xmm5 1813; SSSE3-NEXT: movdqa %xmm6, %xmm9 1814; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 1815; SSSE3-NEXT: movdqa %xmm2, %xmm13 1816; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] 1817; SSSE3-NEXT: pmullw %xmm9, %xmm13 1818; SSSE3-NEXT: movdqa %xmm13, %xmm14 1819; SSSE3-NEXT: pand %xmm8, %xmm14 1820; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 1821; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 1822; SSSE3-NEXT: pmullw %xmm6, %xmm2 1823; SSSE3-NEXT: movdqa %xmm2, %xmm9 1824; SSSE3-NEXT: pand %xmm8, %xmm9 1825; SSSE3-NEXT: packuswb %xmm14, %xmm9 1826; SSSE3-NEXT: movdqa %xmm7, %xmm6 1827; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] 1828; SSSE3-NEXT: movdqa %xmm3, %xmm14 1829; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] 1830; SSSE3-NEXT: pmullw %xmm6, %xmm14 1831; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] 1832; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 1833; SSSE3-NEXT: pmullw %xmm7, %xmm3 1834; SSSE3-NEXT: movdqa %xmm14, %xmm6 1835; SSSE3-NEXT: pand %xmm8, %xmm6 1836; SSSE3-NEXT: pand %xmm3, %xmm8 1837; SSSE3-NEXT: packuswb %xmm6, %xmm8 1838; SSSE3-NEXT: psrlw $8, %xmm14 1839; SSSE3-NEXT: psrlw $8, %xmm3 1840; SSSE3-NEXT: packuswb %xmm14, %xmm3 1841; SSSE3-NEXT: psrlw $8, %xmm13 1842; SSSE3-NEXT: psrlw $8, %xmm2 1843; SSSE3-NEXT: packuswb 
%xmm13, %xmm2 1844; SSSE3-NEXT: psrlw $8, %xmm12 1845; SSSE3-NEXT: psrlw $8, %xmm1 1846; SSSE3-NEXT: packuswb %xmm12, %xmm1 1847; SSSE3-NEXT: psrlw $8, %xmm11 1848; SSSE3-NEXT: psrlw $8, %xmm0 1849; SSSE3-NEXT: packuswb %xmm11, %xmm0 1850; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 1851; SSSE3-NEXT: pcmpeqb %xmm10, %xmm2 1852; SSSE3-NEXT: pcmpeqb %xmm10, %xmm1 1853; SSSE3-NEXT: pcmpeqb %xmm10, %xmm0 1854; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 1855; SSSE3-NEXT: pxor %xmm6, %xmm3 1856; SSSE3-NEXT: pxor %xmm6, %xmm2 1857; SSSE3-NEXT: pxor %xmm6, %xmm1 1858; SSSE3-NEXT: pxor %xmm6, %xmm0 1859; SSSE3-NEXT: movdqa %xmm0, %xmm6 1860; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) 1861; SSSE3-NEXT: movdqa %xmm1, %xmm7 1862; SSSE3-NEXT: movdqa %xmm9, 32(%rsi) 1863; SSSE3-NEXT: movdqa %xmm2, %xmm8 1864; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) 1865; SSSE3-NEXT: movdqa %xmm3, %xmm5 1866; SSSE3-NEXT: movdqa %xmm4, (%rsi) 1867; SSSE3-NEXT: movdqa %xmm3, %xmm4 1868; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1869; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1870; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) 1871; SSSE3-NEXT: movdqa %xmm2, %xmm3 1872; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1873; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1874; SSSE3-NEXT: movdqa %xmm2, 128(%rdi) 1875; SSSE3-NEXT: movdqa %xmm1, %xmm2 1876; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1877; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1878; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) 1879; SSSE3-NEXT: movdqa %xmm0, %xmm1 1880; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1881; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1882; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1883; SSSE3-NEXT: movdqa %xmm0, (%rdi) 1884; SSSE3-NEXT: movdqa %xmm4, %xmm0 1885; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1886; SSSE3-NEXT: pslld $31, %xmm4 1887; SSSE3-NEXT: psrad $31, %xmm4 1888; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) 1889; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1890; SSSE3-NEXT: pslld $31, %xmm0 1891; SSSE3-NEXT: psrad $31, %xmm0 1892; SSSE3-NEXT: movdqa %xmm0, 240(%rdi) 1893; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1894; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1895; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1896; SSSE3-NEXT: pslld $31, %xmm5 1897; SSSE3-NEXT: psrad $31, %xmm5 1898; SSSE3-NEXT: movdqa %xmm5, 208(%rdi) 1899; SSSE3-NEXT: movdqa %xmm3, %xmm0 1900; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1901; SSSE3-NEXT: pslld $31, %xmm3 1902; SSSE3-NEXT: psrad $31, %xmm3 1903; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) 1904; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1905; SSSE3-NEXT: pslld $31, %xmm0 1906; SSSE3-NEXT: psrad $31, %xmm0 1907; SSSE3-NEXT: movdqa %xmm0, 176(%rdi) 1908; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1909; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1910; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 1911; SSSE3-NEXT: pslld $31, %xmm8 1912; SSSE3-NEXT: psrad $31, %xmm8 1913; SSSE3-NEXT: movdqa %xmm8, 144(%rdi) 1914; SSSE3-NEXT: movdqa %xmm2, %xmm0 1915; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1916; SSSE3-NEXT: pslld $31, %xmm2 1917; SSSE3-NEXT: psrad 
$31, %xmm2 1918; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) 1919; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1920; SSSE3-NEXT: pslld $31, %xmm0 1921; SSSE3-NEXT: psrad $31, %xmm0 1922; SSSE3-NEXT: movdqa %xmm0, 112(%rdi) 1923; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1924; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1925; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 1926; SSSE3-NEXT: pslld $31, %xmm7 1927; SSSE3-NEXT: psrad $31, %xmm7 1928; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) 1929; SSSE3-NEXT: movdqa %xmm1, %xmm0 1930; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1931; SSSE3-NEXT: pslld $31, %xmm1 1932; SSSE3-NEXT: psrad $31, %xmm1 1933; SSSE3-NEXT: movdqa %xmm1, 32(%rdi) 1934; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1935; SSSE3-NEXT: pslld $31, %xmm0 1936; SSSE3-NEXT: psrad $31, %xmm0 1937; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) 1938; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1939; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1940; SSSE3-NEXT: pslld $31, %xmm6 1941; SSSE3-NEXT: psrad $31, %xmm6 1942; SSSE3-NEXT: movdqa %xmm6, 16(%rdi) 1943; SSSE3-NEXT: retq 1944; 1945; SSE41-LABEL: umulo_v64i8: 1946; SSE41: # %bb.0: 1947; SSE41-NEXT: movq %rdi, %rax 1948; SSE41-NEXT: pxor %xmm13, %xmm13 1949; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 1950; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] 1951; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1952; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 1953; SSE41-NEXT: pmullw %xmm4, %xmm0 1954; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] 1955; SSE41-NEXT: movdqa %xmm0, %xmm4 1956; SSE41-NEXT: pand %xmm9, %xmm4 1957; SSE41-NEXT: pmullw %xmm10, %xmm8 1958; SSE41-NEXT: movdqa %xmm8, %xmm10 1959; SSE41-NEXT: pand %xmm9, %xmm10 1960; SSE41-NEXT: packuswb %xmm4, %xmm10 1961; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 1962; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] 1963; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1964; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] 1965; SSE41-NEXT: pmullw %xmm5, %xmm1 1966; SSE41-NEXT: movdqa %xmm1, %xmm5 1967; SSE41-NEXT: pand %xmm9, %xmm5 1968; SSE41-NEXT: pmullw %xmm11, %xmm4 1969; SSE41-NEXT: movdqa %xmm4, %xmm11 1970; SSE41-NEXT: pand %xmm9, %xmm11 1971; SSE41-NEXT: packuswb %xmm5, %xmm11 1972; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 1973; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] 1974; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1975; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] 1976; SSE41-NEXT: pmullw %xmm6, %xmm2 1977; SSE41-NEXT: movdqa %xmm2, %xmm6 1978; SSE41-NEXT: pand %xmm9, %xmm6 1979; SSE41-NEXT: pmullw %xmm12, %xmm5 1980; SSE41-NEXT: movdqa %xmm5, %xmm12 1981; SSE41-NEXT: pand %xmm9, %xmm12 1982; SSE41-NEXT: packuswb %xmm6, %xmm12 1983; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero 1984; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] 1985; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1986; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] 1987; SSE41-NEXT: pmullw %xmm7, %xmm3 1988; SSE41-NEXT: pmullw %xmm14, %xmm6 1989; SSE41-NEXT: movdqa %xmm3, %xmm7 1990; SSE41-NEXT: pand %xmm9, %xmm7 1991; SSE41-NEXT: pand %xmm6, %xmm9 1992; SSE41-NEXT: packuswb %xmm7, %xmm9 1993; SSE41-NEXT: psrlw $8, %xmm3 1994; SSE41-NEXT: psrlw $8, %xmm6 1995; SSE41-NEXT: packuswb %xmm3, %xmm6 1996; SSE41-NEXT: psrlw $8, %xmm2 1997; SSE41-NEXT: psrlw $8, %xmm5 1998; SSE41-NEXT: packuswb %xmm2, %xmm5 1999; SSE41-NEXT: psrlw $8, %xmm1 2000; SSE41-NEXT: psrlw $8, %xmm4 2001; SSE41-NEXT: packuswb %xmm1, %xmm4 2002; SSE41-NEXT: psrlw $8, %xmm0 2003; SSE41-NEXT: psrlw $8, %xmm8 2004; SSE41-NEXT: packuswb %xmm0, %xmm8 2005; SSE41-NEXT: pcmpeqb %xmm13, %xmm6 2006; SSE41-NEXT: pcmpeqb %xmm13, %xmm5 2007; SSE41-NEXT: pcmpeqb %xmm13, %xmm4 2008; SSE41-NEXT: pcmpeqb %xmm13, %xmm8 2009; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2010; SSE41-NEXT: pxor %xmm0, %xmm6 2011; SSE41-NEXT: pxor %xmm0, %xmm5 2012; SSE41-NEXT: pxor %xmm0, %xmm4 2013; SSE41-NEXT: pxor %xmm0, %xmm8 2014; SSE41-NEXT: movdqa %xmm9, 48(%rsi) 2015; SSE41-NEXT: movdqa %xmm12, 32(%rsi) 2016; SSE41-NEXT: movdqa %xmm11, 16(%rsi) 2017; SSE41-NEXT: movdqa %xmm10, (%rsi) 2018; SSE41-NEXT: pmovsxbd %xmm6, %xmm0 2019; SSE41-NEXT: movdqa %xmm0, 192(%rdi) 2020; SSE41-NEXT: pmovsxbd %xmm5, %xmm0 2021; SSE41-NEXT: movdqa %xmm0, 128(%rdi) 2022; SSE41-NEXT: pmovsxbd %xmm4, %xmm0 2023; SSE41-NEXT: movdqa %xmm0, 64(%rdi) 2024; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 2025; SSE41-NEXT: movdqa %xmm0, (%rdi) 2026; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 2027; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2028; SSE41-NEXT: pslld $31, %xmm0 2029; SSE41-NEXT: psrad $31, %xmm0 2030; SSE41-NEXT: movdqa %xmm0, 224(%rdi) 2031; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] 2032; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2033; SSE41-NEXT: pslld $31, %xmm0 2034; SSE41-NEXT: psrad $31, %xmm0 2035; SSE41-NEXT: movdqa %xmm0, 240(%rdi) 2036; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm6[1,1,1,1] 2037; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2038; SSE41-NEXT: pslld $31, %xmm0 2039; SSE41-NEXT: psrad $31, %xmm0 2040; SSE41-NEXT: movdqa %xmm0, 208(%rdi) 2041; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 2042; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2043; SSE41-NEXT: pslld $31, %xmm0 2044; SSE41-NEXT: psrad $31, %xmm0 2045; SSE41-NEXT: movdqa %xmm0, 160(%rdi) 2046; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 2047; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2048; SSE41-NEXT: pslld $31, %xmm0 2049; SSE41-NEXT: psrad $31, %xmm0 2050; SSE41-NEXT: movdqa %xmm0, 176(%rdi) 2051; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 2052; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2053; SSE41-NEXT: pslld $31, %xmm0 2054; SSE41-NEXT: psrad $31, %xmm0 2055; SSE41-NEXT: movdqa %xmm0, 144(%rdi) 2056; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 2057; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2058; SSE41-NEXT: pslld $31, %xmm0 2059; SSE41-NEXT: psrad $31, %xmm0 2060; SSE41-NEXT: movdqa %xmm0, 96(%rdi) 2061; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] 2062; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2063; SSE41-NEXT: pslld $31, %xmm0 2064; SSE41-NEXT: psrad $31, %xmm0 2065; SSE41-NEXT: movdqa %xmm0, 112(%rdi) 2066; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2067; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2068; SSE41-NEXT: pslld $31, %xmm0 2069; SSE41-NEXT: psrad $31, %xmm0 2070; SSE41-NEXT: movdqa %xmm0, 80(%rdi) 2071; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 2072; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2073; SSE41-NEXT: pslld $31, %xmm0 2074; SSE41-NEXT: psrad $31, %xmm0 2075; SSE41-NEXT: movdqa %xmm0, 32(%rdi) 2076; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] 2077; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2078; SSE41-NEXT: pslld $31, %xmm0 2079; SSE41-NEXT: psrad $31, %xmm0 2080; SSE41-NEXT: movdqa %xmm0, 48(%rdi) 2081; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2082; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2083; SSE41-NEXT: pslld $31, %xmm0 2084; SSE41-NEXT: psrad $31, %xmm0 2085; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 2086; SSE41-NEXT: retq 2087; 2088; AVX1-LABEL: umulo_v64i8: 2089; AVX1: # %bb.0: 2090; AVX1-NEXT: movq %rdi, %rax 2091; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 2092; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2093; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2094; AVX1-NEXT: vpmullw %xmm4, %xmm6, 
%xmm6
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm9
; AVX1-NEXT: vpackuswb %xmm4, %xmm9, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX1-NEXT: vpmullw %xmm9, %xmm10, %xmm9
; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm11
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm10
; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm0
; AVX1-NEXT: vpackuswb %xmm11, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
; AVX1-NEXT: vpmullw %xmm2, %xmm11, %xmm11
; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm12, %xmm13, %xmm12
; AVX1-NEXT: vpand %xmm7, %xmm12, %xmm13
; AVX1-NEXT: vpackuswb %xmm2, %xmm13, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
; AVX1-NEXT: vpmullw %xmm13, %xmm14, %xmm13
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm13, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm7
; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm11
; AVX1-NEXT: vpackuswb %xmm3, %xmm11, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm9
; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
; AVX1-NEXT: vpackuswb %xmm9, %xmm10, %xmm9
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8
; AVX1-NEXT: vpackuswb %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm8
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm9
; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10
; AVX1-NEXT: vpxor %xmm1, %xmm10, %xmm6
; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm5
; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm3
; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm1
; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi)
; AVX1-NEXT: vmovdqa %xmm2, 32(%rsi)
; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
; AVX1-NEXT: vmovdqa %xmm4, (%rsi)
; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm0
; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7
; AVX2-NEXT: vpand %ymm6, %ymm7, %ymm8
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm3
; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm3
; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm6
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm9, %ymm9
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8
; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm8, 192(%rdi)
; AVX2-NEXT: vmovdqa %ymm1, 128(%rdi)
; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: vmovdqa %ymm9, 224(%rdi)
; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi)
; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi)
; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umulo_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm6
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k3
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm7
; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
; AVX512F-NEXT: vpmovdb %zmm4, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: umulo_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm5, %zmm1, %zmm4
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
 %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1)
 %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0
 %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1
 %res = sext <64 x i1> %obit to <64 x i32>
 store <64 x i8> %val, ptr %p2
 ret <64 x i32> %res
}
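
; For i16 elements there is a real vector high-half multiply (pmulhuw), so
; umulo can be computed directly: the multiply overflows iff the high half of
; the 16x16->32 product is nonzero.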
define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: pmulhuw %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: movdqa %xmm2, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pmullw %xmm1, %xmm2
; SSSE3-NEXT: pmulhuw %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pcmpeqw %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: movdqa %xmm2, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmullw %xmm1, %xmm2
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umulo_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: umulo_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmw %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512BW-NEXT: retq
 %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
 %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
 %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
 %res = sext <8 x i1> %obit to <8 x i32>
 store <8 x i16> %val, ptr %p2
 ret <8 x i32> %res
}
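
; There is no vector pattern for 64-bit umulo, so v2i64 is scalarized: each
; lane goes through a 64-bit mulq, whose overflow flag (seto/cmovo) supplies
; the per-lane overflow mask.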
define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rsi
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: xorl %r8d, %r8d
; SSE2-NEXT: mulq %rdx
; SSE2-NEXT: movq $-1, %r9
; SSE2-NEXT: movl $0, %r10d
; SSE2-NEXT: cmovoq %r9, %r10
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movq %r10, %xmm0
; SSE2-NEXT: cmovoq %r9, %r8
; SSE2-NEXT: movq %r8, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %rcx
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %rsi
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: xorl %r8d, %r8d
; SSSE3-NEXT: mulq %rdx
; SSSE3-NEXT: movq $-1, %r9
; SSSE3-NEXT: movl $0, %r10d
; SSSE3-NEXT: cmovoq %r9, %r10
; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movq %r10, %xmm0
; SSSE3-NEXT: cmovoq %r9, %r8
; SSSE3-NEXT: movq %r8, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %xmm1, %rsi
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: pextrq $1, %xmm1, %rdx
; SSE41-NEXT: xorl %r8d, %r8d
; SSE41-NEXT: mulq %rdx
; SSE41-NEXT: movq $-1, %r9
; SSE41-NEXT: movl $0, %r10d
; SSE41-NEXT: cmovoq %r9, %r10
; SSE41-NEXT: movq %rax, %xmm0
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rax, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE41-NEXT: movq %r10, %xmm0
; SSE41-NEXT: cmovoq %r9, %r8
; SSE41-NEXT: movq %r8, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: movdqa %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: umulo_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: vmovq %xmm1, %rsi
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vpextrq $1, %xmm1, %rdx
; AVX-NEXT: xorl %r8d, %r8d
; AVX-NEXT: mulq %rdx
; AVX-NEXT: movq $-1, %r9
; AVX-NEXT: movl $0, %r10d
; AVX-NEXT: cmovoq %r9, %r10
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovq %r10, %xmm0
; AVX-NEXT: cmovoq %r9, %r8
; AVX-NEXT: vmovq %r8, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: umulo_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rcx
; AVX512F-NEXT: vmovq %xmm1, %rsi
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx
; AVX512F-NEXT: mulq %rdx
; AVX512F-NEXT: seto %r8b
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: mulq %rsi
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512F-NEXT: seto %al
; AVX512F-NEXT: andl $1, %eax
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: kmovw %r8d, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $14, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: umulo_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq %xmm0, %rcx
; AVX512BW-NEXT: vmovq %xmm1, %rsi
; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx
; AVX512BW-NEXT: mulq %rdx
; AVX512BW-NEXT: seto %r8b
; AVX512BW-NEXT: vmovq %rax, %xmm0
; AVX512BW-NEXT: movq %rcx, %rax
; AVX512BW-NEXT: mulq %rsi
; AVX512BW-NEXT: vmovq %rax, %xmm1
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512BW-NEXT: seto %al
; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: kmovw %eax, %k0
; AVX512BW-NEXT: kmovd %r8d, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512BW-NEXT: retq
 %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
 %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
 %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
 %res = sext <2 x i1> %obit to <2 x i32>
 store <2 x i64> %val, ptr %p2
 ret <2 x i32> %res
}
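
; i24 is an illegal type that is promoted to i32: the inputs are masked to 24
; bits and multiplied as i32. Overflow occurs if the high 32 bits of the
; widened product are nonzero or if the low product needs more than 24 bits
; (checked via psrld $24). The i24 results are stored 3 bytes at a time with
; word+byte stores.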
define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v4i24:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: psrld $24, %xmm0
; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm2, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 2(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movw %ax, 9(%rdi)
; SSE2-NEXT: shrl $16, %ecx
; SSE2-NEXT: movb %cl, 8(%rdi)
; SSE2-NEXT: shrl $16, %edx
; SSE2-NEXT: movb %dl, 5(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 11(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v4i24:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT: pxor %xmm3, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: psrld $24, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: movd %xmm2, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: movw %cx, 6(%rdi)
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 2(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: movw %ax, 9(%rdi)
; SSSE3-NEXT: shrl $16, %ecx
; SSSE3-NEXT: movb %cl, 8(%rdi)
; SSSE3-NEXT: shrl $16, %edx
; SSSE3-NEXT: movb %dl, 5(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 11(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v4i24:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm4
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pextrd $3, %xmm1, %eax
; SSE41-NEXT: pextrd $2, %xmm1, %ecx
; SSE41-NEXT: pextrd $1, %xmm1, %edx
; SSE41-NEXT: movd %xmm1, %esi
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $24, %xmm0
; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movw %ax, 9(%rdi)
; SSE41-NEXT: movw %cx, 6(%rdi)
; SSE41-NEXT: movw %dx, 3(%rdi)
; SSE41-NEXT: movw %si, (%rdi)
; SSE41-NEXT: shrl $16, %eax
; SSE41-NEXT: movb %al, 11(%rdi)
; SSE41-NEXT: shrl $16, %ecx
; SSE41-NEXT: movb %cl, 8(%rdi)
; SSE41-NEXT: shrl $16, %edx
; SSE41-NEXT: movb %dl, 5(%rdi)
; SSE41-NEXT: shrl $16, %esi
; SSE41-NEXT: movb %sil, 2(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v4i24:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
; AVX1-NEXT: movw %ax, 9(%rdi)
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: movw %cx, 6(%rdi)
; AVX1-NEXT: vpextrd $1, %xmm1, %edx
; AVX1-NEXT: movw %dx, 3(%rdi)
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: movw %si, (%rdi)
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: movb %al, 11(%rdi)
; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: movb %cl, 8(%rdi)
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movb %dl, 5(%rdi)
; AVX1-NEXT: shrl $16, %esi
; AVX1-NEXT: movb %sil, 2(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v4i24:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrld $24, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpextrd $3, %xmm1, %eax
; AVX2-NEXT: movw %ax, 9(%rdi)
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: movw %cx, 6(%rdi)
; AVX2-NEXT: vpextrd $1, %xmm1, %edx
; AVX2-NEXT: movw %dx, 3(%rdi)
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: movw %si, (%rdi)
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: movb %al, 11(%rdi)
; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: movb %cl, 8(%rdi)
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movb %dl, 5(%rdi)
; AVX2-NEXT: shrl $16, %esi
; AVX2-NEXT: movb %sil, 2(%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v4i24:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0
; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $3, %xmm1, %eax
; AVX512-NEXT: movw %ax, 9(%rdi)
; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
; AVX512-NEXT: movw %cx, 6(%rdi)
; AVX512-NEXT: vpextrd $1, %xmm1, %edx
; AVX512-NEXT: movw %dx, 3(%rdi)
; AVX512-NEXT: vmovd %xmm1, %esi
; AVX512-NEXT: movw %si, (%rdi)
; AVX512-NEXT: shrl $16, %eax
; AVX512-NEXT: movb %al, 11(%rdi)
; AVX512-NEXT: shrl $16, %ecx
; AVX512-NEXT: movb %cl, 8(%rdi)
; AVX512-NEXT: shrl $16, %edx
; AVX512-NEXT: movb %dl, 5(%rdi)
; AVX512-NEXT: shrl $16, %esi
; AVX512-NEXT: movb %sil, 2(%rdi)
; AVX512-NEXT: retq
 %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
 %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
 %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
 %res = sext <4 x i1> %obit to <4 x i32>
 store <4 x i24> %val, ptr %p2
 ret <4 x i32> %res
}

define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: umulo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: umulo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: umulo_v4i1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movb %al, (%rdi)
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: umulo_v4i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: retq
 %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
 %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
 %res = sext <4 x i1> %obit to <4 x i32>
 store <4 x i1> %val, ptr %p2
 ret <4 x i32> %res
}
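
; v2i128 is expanded per lane with the usual 128-bit schoolbook sequence:
; overflow is set if both high halves are nonzero, if either hi*lo cross
; multiply overflows (seto), or if adding the cross products into the high
; half of lo*lo carries out (setb).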
define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i128:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: movq %rcx, %r10
; SSE2-NEXT: movq %rdx, %rcx
; SSE2-NEXT: movq %rsi, %rax
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT: testq %r11, %r11
; SSE2-NEXT: setne %dl
; SSE2-NEXT: testq %rsi, %rsi
; SSE2-NEXT: setne %bpl
; SSE2-NEXT: andb %dl, %bpl
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rsi
; SSE2-NEXT: seto %r15b
; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %rdi
; SSE2-NEXT: seto %r12b
; SSE2-NEXT: orb %r15b, %r12b
; SSE2-NEXT: orb %bpl, %r12b
; SSE2-NEXT: leaq (%rsi,%rax), %r11
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq %rdx, %rsi
; SSE2-NEXT: addq %r11, %rsi
; SSE2-NEXT: setb %r11b
; SSE2-NEXT: orb %r12b, %r11b
; SSE2-NEXT: testq %r9, %r9
; SSE2-NEXT: setne %al
; SSE2-NEXT: testq %r10, %r10
; SSE2-NEXT: setne %bpl
; SSE2-NEXT: andb %al, %bpl
; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r14
; SSE2-NEXT: movq %rax, %r8
; SSE2-NEXT: seto %r10b
; SSE2-NEXT: movq %r9, %rax
; SSE2-NEXT: mulq %rcx
; SSE2-NEXT: seto %r9b
; SSE2-NEXT: orb %r10b, %r9b
; SSE2-NEXT: orb %bpl, %r9b
; SSE2-NEXT: addq %rax, %r8
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %r14
; SSE2-NEXT: addq %r8, %rdx
; SSE2-NEXT: setb %cl
; SSE2-NEXT: orb %r9b, %cl
; SSE2-NEXT: movzbl %cl, %ecx
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: movzbl %r11b, %ecx
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movq %rax, 16(%rbx)
; SSE2-NEXT: movq %rdi, (%rbx)
; SSE2-NEXT: movq %rdx, 24(%rbx)
; SSE2-NEXT: movq %rsi, 8(%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i128:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movq %r9, %r11
; SSSE3-NEXT: movq %rcx, %r10
; SSSE3-NEXT: movq %rdx, %rcx
; SSSE3-NEXT: movq %rsi, %rax
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSSE3-NEXT: testq %r11, %r11
; SSSE3-NEXT: setne %dl
; SSSE3-NEXT: testq %rsi, %rsi
; SSSE3-NEXT: setne %bpl
; SSSE3-NEXT: andb %dl, %bpl
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rsi
; SSSE3-NEXT: seto %r15b
; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %rdi
; SSSE3-NEXT: seto %r12b
; SSSE3-NEXT: orb %r15b, %r12b
; SSSE3-NEXT: orb %bpl, %r12b
; SSSE3-NEXT: leaq (%rsi,%rax), %r11
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: movq %rdx, %rsi
; SSSE3-NEXT: addq %r11, %rsi
; SSSE3-NEXT: setb %r11b
; SSSE3-NEXT: orb %r12b, %r11b
; SSSE3-NEXT: testq %r9, %r9
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: testq %r10, %r10
; SSSE3-NEXT: setne %bpl
; SSSE3-NEXT: andb %al, %bpl
; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r14
; SSSE3-NEXT: movq %rax, %r8
; SSSE3-NEXT: seto %r10b
; SSSE3-NEXT: movq %r9, %rax
; SSSE3-NEXT: mulq %rcx
; SSSE3-NEXT: seto %r9b
; SSSE3-NEXT: orb %r10b, %r9b
; SSSE3-NEXT: orb %bpl, %r9b
; SSSE3-NEXT: addq %rax, %r8
; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %r14
; SSSE3-NEXT: addq %r8, %rdx
; SSSE3-NEXT: setb %cl
; SSSE3-NEXT: orb %r9b, %cl
; SSSE3-NEXT: movzbl %cl, %ecx
; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: movzbl %r11b, %ecx
; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %rax, 16(%rbx)
; SSSE3-NEXT: movq %rdi, (%rbx)
; SSSE3-NEXT: movq %rdx, 24(%rbx)
; SSSE3-NEXT: movq %rsi, 8(%rbx)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r14
; SSSE3-NEXT: popq %r15
; SSSE3-NEXT: popq %rbp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i128:
; SSE41: # %bb.0:
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %r15
; SSE41-NEXT: pushq %r14
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: movq %r9, %r11
; SSE41-NEXT: movq %rcx, %r10
; SSE41-NEXT: movq %rdx, %rcx
; SSE41-NEXT: movq %rsi, %rax
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT: testq %r11, %r11
; SSE41-NEXT: setne %dl
; SSE41-NEXT: testq %rsi, %rsi
; SSE41-NEXT: setne %bpl
; SSE41-NEXT: andb %dl, %bpl
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rsi
; SSE41-NEXT: seto %r15b
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %rdi
; SSE41-NEXT: seto %r12b
; SSE41-NEXT: orb %r15b, %r12b
; SSE41-NEXT: orb %bpl, %r12b
; SSE41-NEXT: leaq (%rsi,%rax), %r11
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rdi
; SSE41-NEXT: movq %rdx, %rsi
; SSE41-NEXT: addq %r11, %rsi
; SSE41-NEXT: setb %r11b
; SSE41-NEXT: orb %r12b, %r11b
; SSE41-NEXT: testq %r9, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: testq %r10, %r10
; SSE41-NEXT: setne %bpl
; SSE41-NEXT: andb %al, %bpl
; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r14
; SSE41-NEXT: movq %rax, %r8
; SSE41-NEXT: seto %r10b
; SSE41-NEXT: movq %r9, %rax
; SSE41-NEXT: mulq %rcx
; SSE41-NEXT: seto %r9b
; SSE41-NEXT: orb %r10b, %r9b
; SSE41-NEXT: orb %bpl, %r9b
; SSE41-NEXT: addq %rax, %r8
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %r14
; SSE41-NEXT: addq %r8, %rdx
; SSE41-NEXT: setb %cl
; SSE41-NEXT: orb %r9b, %cl
; SSE41-NEXT: movzbl %cl, %ecx
; SSE41-NEXT: negl %ecx
; SSE41-NEXT: movzbl %r11b, %r8d
; SSE41-NEXT: negl %r8d
; SSE41-NEXT: movd %r8d, %xmm0
; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
; SSE41-NEXT: movq %rax, 16(%rbx)
; SSE41-NEXT: movq %rdi, (%rbx)
; SSE41-NEXT: movq %rdx, 24(%rbx)
; SSE41-NEXT: movq %rsi, 8(%rbx)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r14
; SSE41-NEXT: popq %r15
; SSE41-NEXT: popq %rbp
; SSE41-NEXT: retq
;
; AVX-LABEL: umulo_v2i128:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %r9, %r11
; AVX-NEXT: movq %rcx, %r10
; AVX-NEXT: movq %rdx, %rcx
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX-NEXT: testq %r11, %r11
; AVX-NEXT: setne %dl
; AVX-NEXT: testq %rsi, %rsi
; AVX-NEXT: setne %bpl
; AVX-NEXT: andb %dl, %bpl
; AVX-NEXT: mulq %r8
; AVX-NEXT: movq %rax, %rsi
; AVX-NEXT: seto %r15b
; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %rdi
; AVX-NEXT: seto %r12b
; AVX-NEXT: orb %r15b, %r12b
; AVX-NEXT: orb %bpl, %r12b
; AVX-NEXT: leaq (%rsi,%rax), %r11
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: mulq %r8
; AVX-NEXT: movq %rax, %rdi
; AVX-NEXT: movq %rdx, %rsi
; AVX-NEXT: addq %r11, %rsi
; AVX-NEXT: setb %r11b
; AVX-NEXT: orb %r12b, %r11b
; AVX-NEXT: testq %r9, %r9
; AVX-NEXT: setne %al
; AVX-NEXT: testq %r10, %r10
; AVX-NEXT: setne %bpl
; AVX-NEXT: andb %al, %bpl
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r14
; AVX-NEXT: movq %rax, %r8
; AVX-NEXT: seto %r10b
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: mulq %rcx
; AVX-NEXT: seto %r9b
; AVX-NEXT: orb %r10b, %r9b
; AVX-NEXT: orb %bpl, %r9b
; AVX-NEXT: addq %rax, %r8
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %r14
; AVX-NEXT: addq %r8, %rdx
; AVX-NEXT: setb %cl
; AVX-NEXT: orb %r9b, %cl
; AVX-NEXT: movzbl %cl, %ecx
; AVX-NEXT: negl %ecx
; AVX-NEXT: movzbl %r11b, %r8d
; AVX-NEXT: negl %r8d
; AVX-NEXT: vmovd %r8d, %xmm0
; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: movq %rax, 16(%rbx)
; AVX-NEXT: movq %rdi, (%rbx)
; AVX-NEXT: movq %rdx, 24(%rbx)
; AVX-NEXT: movq %rsi, 8(%rbx)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %r15
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
;
; AVX512F-LABEL: umulo_v2i128:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: movq %rdx, %rcx
; AVX512F-NEXT: movq %rsi, %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: testq %rsi, %rsi
; AVX512F-NEXT: setne %dl
; AVX512F-NEXT: testq %rax, %rax
; AVX512F-NEXT: setne %bpl
; AVX512F-NEXT: andb %dl, %bpl
; AVX512F-NEXT: mulq %r14
; AVX512F-NEXT: movq %rax, %r11
; AVX512F-NEXT: seto %r15b
; AVX512F-NEXT: movq %rsi, %rax
; AVX512F-NEXT: mulq %rcx
; AVX512F-NEXT: seto %r12b
; AVX512F-NEXT: orb %r15b, %r12b
; AVX512F-NEXT: orb %bpl, %r12b
; AVX512F-NEXT: addq %rax, %r11
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: mulq %r14
; AVX512F-NEXT: movq %rax, %rsi
; AVX512F-NEXT: movq %rdx, %rcx
; AVX512F-NEXT: addq %r11, %rcx
; AVX512F-NEXT: setb %al
; AVX512F-NEXT: orb %r12b, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: testq %r9, %r9
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: testq %r10, %r10
; AVX512F-NEXT: setne %r11b
; AVX512F-NEXT: andb %al, %r11b
; AVX512F-NEXT: movq %r10, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rax, %r10
; AVX512F-NEXT: seto %bpl
; AVX512F-NEXT: movq %r9, %rax
; AVX512F-NEXT: mulq %rdi
; AVX512F-NEXT: seto %r9b
; AVX512F-NEXT: orb %bpl, %r9b
; AVX512F-NEXT: orb %r11b, %r9b
; AVX512F-NEXT: addq %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: addq %r10, %rdx
; AVX512F-NEXT: setb %dil
; AVX512F-NEXT: orb %r9b, %dil
; AVX512F-NEXT: andl $1, %edi
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: kshiftlw $1, %k0, %k0
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-NEXT: movq %rsi, 16(%rbx)
; AVX512F-NEXT: movq %rax, (%rbx)
; AVX512F-NEXT: movq %rcx, 24(%rbx)
; AVX512F-NEXT: movq %rdx, 8(%rbx)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: umulo_v2i128:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: pushq %r15
; AVX512BW-NEXT: pushq %r14
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
; AVX512BW-NEXT: movq %rcx, %rax
; AVX512BW-NEXT: movq %rdx, %rcx
; AVX512BW-NEXT: movq %rsi, %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512BW-NEXT: testq %rsi, %rsi
; AVX512BW-NEXT: setne %dl
; AVX512BW-NEXT: testq %rax, %rax
; AVX512BW-NEXT: setne %bpl
; AVX512BW-NEXT: andb %dl, %bpl
; AVX512BW-NEXT: mulq %r14
; AVX512BW-NEXT: movq %rax, %r11
; AVX512BW-NEXT: seto %r15b
; AVX512BW-NEXT: movq %rsi, %rax
; AVX512BW-NEXT: mulq %rcx
; AVX512BW-NEXT: seto %r12b
; AVX512BW-NEXT: orb %r15b, %r12b
; AVX512BW-NEXT: orb %bpl, %r12b
; AVX512BW-NEXT: addq %rax, %r11
; AVX512BW-NEXT: movq %rcx, %rax
; AVX512BW-NEXT: mulq %r14
; AVX512BW-NEXT: movq %rax, %rsi
; AVX512BW-NEXT: movq %rdx, %rcx
; AVX512BW-NEXT: addq %r11, %rcx
; AVX512BW-NEXT: setb %al
; AVX512BW-NEXT: orb %r12b, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: testq %r9, %r9
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: testq %r10, %r10
; AVX512BW-NEXT: setne %r11b
; AVX512BW-NEXT: andb %al, %r11b
; AVX512BW-NEXT: movq %r10, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rax, %r10
; AVX512BW-NEXT: seto %bpl
; AVX512BW-NEXT: movq %r9, %rax
; AVX512BW-NEXT: mulq %rdi
; AVX512BW-NEXT: seto %r9b
; AVX512BW-NEXT: orb %bpl, %r9b
; AVX512BW-NEXT: orb %r11b, %r9b
; AVX512BW-NEXT: addq %rax, %r10
; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: addq %r10, %rdx
; AVX512BW-NEXT: setb %dil
; AVX512BW-NEXT: orb %r9b, %dil
; AVX512BW-NEXT: andl $1, %edi
; AVX512BW-NEXT: kmovw %edi, %k1
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT: movq %rsi, 16(%rbx)
; AVX512BW-NEXT: movq %rax, (%rbx)
; AVX512BW-NEXT: movq %rcx, 24(%rbx)
; AVX512BW-NEXT: movq %rdx, 8(%rbx)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r14
; AVX512BW-NEXT: popq %r15
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
 %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
 %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
 %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
 %res = sext <2 x i1> %obit to <2 x i32>
 store <2 x i128> %val, ptr %p2
 ret <2 x i32> %res
}