; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9

; Each function below adds two 64-byte input vectors byte-wise, truncates the
; sum to a narrow subvector, zero-extends that subvector's elements in place
; (by shuffling with zeroinitializer), pads the bytes back out to 64 elements,
; adds a 64-byte bias vector, and stores the full 64-byte result.

define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec16_v2i8_to_v1i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec16_v2i8_to_v1i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <2 x i32>
  %zextd.vec = shufflevector <2 x i8> %in.vec.trunc, <2 x i8> zeroinitializer, <2 x i32>
  %out.bytevec.padded = shufflevector <2 x i8> %zextd.vec, <2 x i8> poison, <64 x i32>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_v4i8_to_v2i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec32_v4i8_to_v2i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:
vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec32_v4i8_to_v1i32_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec32_v4i8_to_v1i32_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec32_v4i8_to_v1i32_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec32_v4i8_to_v1i32_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec32_v4i8_to_v1i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec32_v4i8_to_v1i32_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 
x i32> %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec32_v2i16_to_v1i32_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec32_v2i16_to_v1i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec32_v2i16_to_v1i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec32_v2i16_to_v1i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec32_v2i16_to_v1i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec32_v2i16_to_v1i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> %in.vec.cast = bitcast <4 x i8> %in.vec.trunc to <2 x i16> %zextd.vec = shufflevector <2 x i16> %in.vec.cast, <2 x i16> zeroinitializer, <2 x i32> %out.bytevec = bitcast <2 x i16> %zextd.vec to <4 x i8> %out.bytevec.padded = shufflevector <4 x i8> %out.bytevec, <4 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec64_v8i8_to_v4i16_factor2: ; SSE2: # 
%bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec64_v8i8_to_v4i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v8i8_to_v4i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec64_v8i8_to_v4i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec64_v8i8_to_v4i16_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec64_v8i8_to_v4i16_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec64_v8i8_to_v2i32_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb (%rdx), 
%xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec64_v8i8_to_v2i32_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v8i8_to_v2i32_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec64_v8i8_to_v2i32_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec64_v8i8_to_v2i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec64_v8i8_to_v2i32_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec64_v8i8_to_v1i64_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec64_v8i8_to_v1i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v8i8_to_v1i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec64_v8i8_to_v1i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec64_v8i8_to_v1i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec64_v8i8_to_v1i64_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec64_v4i16_to_v2i32_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec64_v4i16_to_v2i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v4i16_to_v2i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec64_v4i16_to_v2i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec64_v4i16_to_v2i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec64_v4i16_to_v2i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16> %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8> %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec64_v4i16_to_v1i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec64_v4i16_to_v1i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v4i16_to_v1i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec64_v4i16_to_v1i64_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec64_v4i16_to_v1i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: 
vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec64_v4i16_to_v1i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16> %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8> %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec64_v2i32_to_v1i64_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec64_v2i32_to_v1i64_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v2i32_to_v1i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec64_v2i32_to_v1i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec64_v2i32_to_v1i64_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec64_v2i32_to_v1i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <2 x i32> 
%zextd.vec = shufflevector <2 x i32> %in.vec.cast, <2 x i32> zeroinitializer, <2 x i32> %out.bytevec = bitcast <2 x i32> %zextd.vec to <8 x i8> %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v16i8_to_v8i16_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v16i8_to_v8i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v8i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v16i8_to_v8i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v16i8_to_v8i16_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v16i8_to_v8i16_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, 
ptr %out.vec.ptr, align 64 ret void } define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v16i8_to_v4i32_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v16i8_to_v4i32_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v4i32_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v16i8_to_v4i32_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v16i8_to_v4i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v16i8_to_v4i32_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v16i8_to_v2i64_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v16i8_to_v2i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v2i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v16i8_to_v2i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v16i8_to_v2i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v16i8_to_v2i64_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec128_v16i8_to_v1i128_factor16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v1i128_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v16i8_to_v1i128_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v16i8_to_v1i128_factor16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v16i8_to_v1i128_factor16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v8i16_to_v4i32_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v8i16_to_v4i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v8i16_to_v4i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v8i16_to_v4i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v8i16_to_v4i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; 
AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v8i16_to_v4i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v8i16_to_v2i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v8i16_to_v2i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v8i16_to_v2i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v8i16_to_v2i64_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v8i16_to_v2i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v8i16_to_v2i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, 
align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v8i16_to_v1i128_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v8i16_to_v1i128_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v8i16_to_v1i128_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v8i16_to_v1i128_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v8i16_to_v1i128_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v8i16_to_v1i128_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr 
%out.vec.ptr, align 64 ret void } define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v4i32_to_v2i64_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v4i32_to_v2i64_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v4i32_to_v2i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v4i32_to_v2i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v4i32_to_v2i64_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v4i32_to_v2i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32> %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8> %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_v4i32_to_v1i128_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_v4i32_to_v1i128_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; 
SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v4i32_to_v1i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v4i32_to_v1i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v4i32_to_v1i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v4i32_to_v1i128_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32> %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8> %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec128_v2i64_to_v1i128_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec128_v2i64_to_v1i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec128_v2i64_to_v1i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_v2i64_to_v1i128_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; 
AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec128_v2i64_to_v1i128_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <2 x i64> %zextd.vec = shufflevector <2 x i64> %in.vec.cast, <2 x i64> zeroinitializer, <2 x i32> %out.bytevec = bitcast <2 x i64> %zextd.vec to <16 x i8> %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v32i8_to_v16i16_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v32i8_to_v8i32_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v8i32_factor4: 
; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v8i32_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v32i8_to_v8i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v32i8_to_v8i32_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v32i8_to_v4i64_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: 
movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v4i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v4i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v32i8_to_v4i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v32i8_to_v4i64_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v32i8_to_v2i128_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: 
paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v2i128_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] ; SSE42-NEXT: pand %xmm0, %xmm1 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v2i128_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v32i8_to_v2i128_factor16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v32i8_to_v2i128_factor16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %zextd.vec = 
shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec256_v32i8_to_v1i256_factor32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v1i256_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v1i256_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v32i8_to_v1i256_factor32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v32i8_to_v1i256_factor32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v16i16_to_v8i32_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 
(%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v16i16_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: 
movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v4i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v16i16_to_v4i64_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v16i16_to_v4i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v16i16_to_v4i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v16i16_to_v2i128_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: 
movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v2i128_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v2i128_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v16i16_to_v2i128_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v16i16_to_v2i128_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v16i16_to_v2i128_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> 
zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v16i16_to_v1i256_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movaps 16(%rdx), %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v1i256_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v1i256_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void 
@vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v8i32_to_v4i64_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; 
SSE2-LABEL: vec256_v8i32_to_v2i128_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v8i32_to_v2i128_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $17, %al ; 
AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: movb $17, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_v8i32_to_v1i256_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v8i32_to_v1i256_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v1i256_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v8i32_to_v1i256_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v8i32_to_v1i256_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v8i32_to_v1i256_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: 
vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v4i64_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec256_v4i64_to_v2i128_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v4i64_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v4i64_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v4i64_to_v2i128_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $5, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v4i64_to_v2i128_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: movb $5, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, 
<64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64> %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v4i64_to_v1i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec256_v4i64_to_v1i256_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v4i64_to_v1i256_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v4i64_to_v1i256_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v4i64_to_v1i256_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v4i64_to_v1i256_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64> %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec256_v2i128_to_v1i256_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v2i128_to_v1i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), 
%xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v2i128_to_v1i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec256_v2i128_to_v1i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec256_v2i128_to_v1i256_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <2 x i128> %zextd.vec = shufflevector <2 x i128> %in.vec.cast, <2 x i128> zeroinitializer, <2 x i32> %out.bytevec = bitcast <2 x i128> %zextd.vec to <32 x i8> %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v24i16_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE42-NEXT: pxor %xmm3, %xmm3 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v24i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v24i16_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v24i16_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v16i24_factor3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v16i24_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2,0,3,3,0,4,4,0,5] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v12i32_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: 
movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: 
vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr 
%out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v8i48_factor6: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX512F-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5] ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v6i64_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: 
vec384_v48i8_to_v6i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v4i96_factor12: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v4i96_factor12: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = 
zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr 
%in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v3i128_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v3i128_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] ; SSE42-NEXT: pand %xmm0, %xmm1 ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; 
AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v48i8_to_v2i192_factor24: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v2i192_factor24: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 16(%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v2i192_factor24: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v2i192_factor24: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v2i192_factor24: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v2i192_factor24: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec384_v48i8_to_v1i384_factor48: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v1i384_factor48: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v1i384_factor48: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v1i384_factor48: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; 
AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v48i8_to_v1i384_factor48: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v12i32_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v12i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v24i16_to_v12i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v24i16_to_v12i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v8i48_factor3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: 
paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; 
AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; 
AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v6i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 
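; Pad the zero-extended 48-byte result back up to the full 64-byte vector width (tail lanes are poison) before adding the output bias and storing.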
%out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v4i96_factor6: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,3],zero,zero ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7] ; SSE42-NEXT: paddb 16(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7] ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX512BW-SLOW: # 
%bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v3i128_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v3i128_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, 
%zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v2i192_factor12: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v2i192_factor12: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: movaps 32(%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v2i192_factor12: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v24i16_to_v2i192_factor12: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq 
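; In this factor-12 case only i16 elements 0 and 12 of the padded result are nonzero, so the SSE/AVX/AVX2/AVX512F lowerings copy the upper 32 output bytes straight from the bias operand (the movaps/vmovaps of 32(%rdx)), while AVX512BW below folds everything into a single 64-byte vpaddb.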
; ; AVX512F-LABEL: vec384_v24i16_to_v2i192_factor12: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v24i16_to_v2i192_factor12: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v24i16_to_v1i384_factor24: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movaps 16(%rdx), %xmm1 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v1i384_factor24: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v12i32_to_v6i64_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v6i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = 
xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v12i32_to_v6i64_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v12i32_to_v6i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v12i32_to_v4i96_factor3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), 
%xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v4i96_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 
= ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: movb $73, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 ; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: movb $73, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v12i32_to_v3i128_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v3i128_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 
32(%rdx), %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: movb $17, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 ; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: movb $17, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> 
%out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v12i32_to_v2i192_factor6: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE2-NEXT: movaps 32(%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v2i192_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: movaps 32(%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX2-FAST: # %bb.0: ; 
AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $65, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: movb $65, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v12i32_to_v1i384_factor12: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps 16(%rdx), %xmm0 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v1i384_factor12: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v1i384_factor12: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; 
AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v12i32_to_v1i384_factor12: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v12i32_to_v1i384_factor12: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v12i32_to_v1i384_factor12: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec384_v6i64_to_v3i128_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm2 ; SSE-NEXT: paddb 32(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 32(%rcx) ; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v6i64_to_v3i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: 
vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v6i64_to_v3i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v6i64_to_v3i128_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: movb $5, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_v6i64_to_v2i192_factor3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v6i64_to_v2i192_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE42-NEXT: 
movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 16(%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v6i64_to_v2i192_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v6i64_to_v2i192_factor3: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v6i64_to_v2i192_factor3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $9, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v6i64_to_v2i192_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: movb $9, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v6i64_to_v1i384_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec384_v6i64_to_v1i384_factor6: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v6i64_to_v1i384_factor6: ; AVX: # %bb.0: ; 
AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v6i64_to_v1i384_factor6: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v6i64_to_v1i384_factor6: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v6i64_to_v1i384_factor6: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec384_v3i128_to_v1i384_factor3: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v3i128_to_v1i384_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v3i128_to_v1i384_factor3: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3: ; AVX512F: # %bb.0: ; 
AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec384_v3i128_to_v1i384_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <3 x i128> %zextd.vec = shufflevector <3 x i128> %in.vec.cast, <3 x i128> zeroinitializer, <3 x i32> %out.bytevec = bitcast <3 x i128> %zextd.vec to <48 x i8> %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v64i8_to_v32i16_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm4 ; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, 48(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE42-NEXT: pxor %xmm3, %xmm3 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX512BW: # %bb.0: 
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v64i8_to_v16i32_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm4 ; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; 
SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 48(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v16i32_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v64i8_to_v16i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v64i8_to_v16i32_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512BW-NEXT: 
vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v64i8_to_v8i64_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm4 ; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: psrlq $48, %xmm3 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 48(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: 
paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 48(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, 48(%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] ; SSE42-NEXT: pand %xmm0, %xmm1 ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 48(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: 
vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] ; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v64i8_to_v2i256_factor32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: movaps 16(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm3, 48(%rcx) ; SSE2-NEXT: movaps %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v2i256_factor32: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] ; SSE42-NEXT: pand %xmm0, %xmm1 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: movaps 16(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm3, 48(%rcx) ; SSE42-NEXT: movaps %xmm2, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v2i256_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v2i256_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v64i8_to_v2i256_factor32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v64i8_to_v2i256_factor32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec512_v64i8_to_v1i512_factor64: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v1i512_factor64: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v1i512_factor64: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 
(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v64i8_to_v1i512_factor64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v64i8_to_v1i512_factor64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v32i16_to_v16i32_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm4 ; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, 48(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), 
%xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v32i16_to_v8i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), 
%xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm4 ; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 48(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 
(%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v32i16_to_v4i128_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v4i128_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = 
xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm3 ; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, 48(%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; 
AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v32i16_to_v2i256_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: movaps 16(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm3, 48(%rcx) ; SSE2-NEXT: movaps %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v2i256_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: movaps 16(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm3, 48(%rcx) ; SSE42-NEXT: movaps %xmm2, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v2i256_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: 
vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v2i256_factor16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v32i16_to_v1i512_factor32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movaps 16(%rdx), %xmm1 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm3, 48(%rcx) ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v1i512_factor32: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm3, 48(%rcx) ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = 
[65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v16i32_to_v8i64_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm4 ; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, 48(%rcx) ; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa 
(%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v16i32_to_v4i128_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = 
xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm4 ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm4, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v16i32_to_v4i128_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pxor %xmm4, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: paddb 16(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 48(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 48(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; 
AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: movb $17, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v16i32_to_v2i256_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; 
SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: movaps 16(%rdx), %xmm1 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm3, 48(%rcx) ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v16i32_to_v2i256_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm1 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm3, 48(%rcx) ; SSE42-NEXT: movaps %xmm1, 16(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movw $257, %ax # imm = 0x101 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec512_v16i32_to_v1i512_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps 16(%rdx), %xmm0 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm3, 48(%rcx) ; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v16i32_to_v1i512_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} 
xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm3, 48(%rcx) ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v1i512_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v16i32_to_v1i512_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v16i32_to_v1i512_factor16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v16i32_to_v1i512_factor16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec512_v8i64_to_v4i128_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm3 ; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: paddb 32(%rdx), %xmm2 ; SSE-NEXT: movdqa %xmm2, 32(%rcx) ; SSE-NEXT: movdqa 
%xmm1, 48(%rcx) ; SSE-NEXT: movdqa %xmm3, (%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $85, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: movb $5, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 ; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: movb $5, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,5,3,7] ; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; 
AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v8i64_to_v2i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec512_v8i64_to_v2i256_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: movaps 16(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm1 ; SSE-NEXT: paddb 32(%rdx), %xmm0 ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, 32(%rcx) ; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v8i64_to_v2i256_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v8i64_to_v2i256_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $17, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v8i64_to_v2i256_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add 
<64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v8i64_to_v1i512_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec512_v8i64_to_v1i512_factor8: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v8i64_to_v1i512_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v8i64_to_v1i512_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v8i64_to_v1i512_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v8i64_to_v1i512_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v4i128_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec512_v4i128_to_v2i256_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: paddb 16(%rsi), %xmm1 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm2 ; SSE-NEXT: movaps 
48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: paddb 32(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movdqa %xmm1, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v4i128_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_v4i128_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: vec512_v4i128_to_v2i256_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa %xmm0, %xmm1 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128> %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 ret void } define void @vec512_v4i128_to_v1i512_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE-LABEL: vec512_v4i128_to_v1i512_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v4i128_to_v1i512_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; 
; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v2i256_to_v1i512_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    paddb 16(%rsi), %xmm1
; SSE-NEXT:    movaps 32(%rdx), %xmm2
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    paddb 16(%rdx), %xmm1
; SSE-NEXT:    paddb (%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 32(%rcx)
; SSE-NEXT:    movaps %xmm3, 48(%rcx)
; SSE-NEXT:    movdqa %xmm0, (%rcx)
; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <2 x i256>
  %zextd.vec = shufflevector <2 x i256> %in.vec.cast, <2 x i256> zeroinitializer, <2 x i32> <i32 0, i32 3>
  %out.bytevec = bitcast <2 x i256> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; FALLBACK8: {{.*}}
; FALLBACK9: {{.*}}