; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9

define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec16_v2i8_to_v1i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec16_v2i8_to_v1i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec16_v2i8_to_v1i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <2 x i32> <i32 0, i32 1>
  %zextd.vec = shufflevector <2 x i8> %in.vec.trunc, <2 x i8> zeroinitializer, <2 x i32> <i32 0, i32 3>
  %out.bytevec.padded = shufflevector <2 x i8> %zextd.vec, <2 x i8> poison, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_v4i8_to_v2i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec32_v4i8_to_v2i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec32_v4i8_to_v2i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_v4i8_to_v1i32_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec32_v4i8_to_v1i32_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec32_v4i8_to_v1i32_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec32_v4i8_to_v1i32_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec32_v4i8_to_v1i32_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec32_v4i8_to_v1i32_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_v2i16_to_v1i32_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec32_v2i16_to_v1i32_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec32_v2i16_to_v1i32_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec32_v2i16_to_v1i32_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec32_v2i16_to_v1i32_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec32_v2i16_to_v1i32_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %in.vec.cast = bitcast <4 x i8> %in.vec.trunc to <2 x i16>
  %zextd.vec = shufflevector <2 x i16> %in.vec.cast, <2 x i16> zeroinitializer, <2 x i32> <i32 0, i32 3>
  %out.bytevec = bitcast <2 x i16> %zextd.vec to <4 x i8>
  %out.bytevec.padded = shufflevector <4 x i8> %out.bytevec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_v8i8_to_v4i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_v8i8_to_v4i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_v8i8_to_v4i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_v8i8_to_v4i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_v8i8_to_v4i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec64_v8i8_to_v4i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_v8i8_to_v2i32_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_v8i8_to_v2i32_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_v8i8_to_v2i32_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_v8i8_to_v2i32_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_v8i8_to_v2i32_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec64_v8i8_to_v2i32_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_v8i8_to_v1i64_factor8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_v8i8_to_v1i64_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_v8i8_to_v1i64_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_v8i8_to_v1i64_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_v8i8_to_v1i64_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec64_v8i8_to_v1i64_factor8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_v4i16_to_v2i32_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_v4i16_to_v2i32_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_v4i16_to_v2i32_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_v4i16_to_v2i32_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_v4i16_to_v2i32_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec64_v4i16_to_v2i32_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16>
  %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8>
  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_v4i16_to_v1i64_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_v4i16_to_v1i64_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_v4i16_to_v1i64_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_v4i16_to_v1i64_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_v4i16_to_v1i64_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec64_v4i16_to_v1i64_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16>
  %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8>
  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_v2i32_to_v1i64_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_v2i32_to_v1i64_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_v2i32_to_v1i64_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_v2i32_to_v1i64_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_v2i32_to_v1i64_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec64_v2i32_to_v1i64_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <2 x i32>
  %zextd.vec = shufflevector <2 x i32> %in.vec.cast, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 3>
  %out.bytevec = bitcast <2 x i32> %zextd.vec to <8 x i8>
  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_v16i8_to_v8i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_v16i8_to_v8i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_v16i8_to_v8i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_v16i8_to_v8i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_v16i8_to_v8i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec128_v16i8_to_v8i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_v16i8_to_v4i32_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_v16i8_to_v4i32_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_v16i8_to_v4i32_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_v16i8_to_v4i32_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_v16i8_to_v4i32_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec128_v16i8_to_v4i32_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_v16i8_to_v2i64_factor8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_v16i8_to_v2i64_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_v16i8_to_v2i64_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_v16i8_to_v2i64_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_v16i8_to_v2i64_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec128_v16i8_to_v2i64_factor8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec128_v16i8_to_v1i128_factor16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: paddb (%rdx), %xmm0
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec128_v16i8_to_v1i128_factor16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_v16i8_to_v1i128_factor16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_v16i8_to_v1i128_factor16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec128_v16i8_to_v1i128_factor16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_v8i16_to_v4i32_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_v8i16_to_v4i32_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_v8i16_to_v4i32_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_v8i16_to_v4i32_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_v8i16_to_v4i32_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec128_v8i16_to_v4i32_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
  %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
  %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_v8i16_to_v2i64_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_v8i16_to_v2i64_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_v8i16_to_v2i64_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_v8i16_to_v2i64_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_v8i16_to_v2i64_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec128_v8i16_to_v2i64_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
  %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_v8i16_to_v1i128_factor8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_v8i16_to_v1i128_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi),
%xmm0 1161; SSE42-NEXT: paddb (%rsi), %xmm0 1162; SSE42-NEXT: pxor %xmm1, %xmm1 1163; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1164; SSE42-NEXT: paddb (%rdx), %xmm1 1165; SSE42-NEXT: movdqa %xmm1, (%rcx) 1166; SSE42-NEXT: retq 1167; 1168; AVX-LABEL: vec128_v8i16_to_v1i128_factor8: 1169; AVX: # %bb.0: 1170; AVX-NEXT: vmovdqa (%rdi), %xmm0 1171; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1172; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1173; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1174; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1175; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1176; AVX-NEXT: retq 1177; 1178; AVX2-LABEL: vec128_v8i16_to_v1i128_factor8: 1179; AVX2: # %bb.0: 1180; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1181; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1182; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1183; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1184; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1185; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1186; AVX2-NEXT: vzeroupper 1187; AVX2-NEXT: retq 1188; 1189; AVX512F-LABEL: vec128_v8i16_to_v1i128_factor8: 1190; AVX512F: # %bb.0: 1191; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1192; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1193; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1194; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1195; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1196; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1197; AVX512F-NEXT: vzeroupper 1198; AVX512F-NEXT: retq 1199; 1200; AVX512BW-LABEL: vec128_v8i16_to_v1i128_factor8: 1201; AVX512BW: # %bb.0: 1202; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1203; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1204; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1205; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1206; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1207; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1208; AVX512BW-NEXT: vzeroupper 1209; AVX512BW-NEXT: retq 1210 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1211 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1212 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1213 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1214 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> 1215 %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1216 %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> 1217 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1218 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1219 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1220 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1221 ret void 1222} 1223 
1224define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1225; SSE2-LABEL: vec128_v4i32_to_v2i64_factor2: 1226; SSE2: # %bb.0: 1227; SSE2-NEXT: movdqa (%rdi), %xmm0 1228; SSE2-NEXT: paddb (%rsi), %xmm0 1229; SSE2-NEXT: pxor %xmm1, %xmm1 1230; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1231; SSE2-NEXT: paddb (%rdx), %xmm0 1232; SSE2-NEXT: movdqa %xmm0, (%rcx) 1233; SSE2-NEXT: retq 1234; 1235; SSE42-LABEL: vec128_v4i32_to_v2i64_factor2: 1236; SSE42: # %bb.0: 1237; SSE42-NEXT: movdqa (%rdi), %xmm0 1238; SSE42-NEXT: paddb (%rsi), %xmm0 1239; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1240; SSE42-NEXT: paddb (%rdx), %xmm0 1241; SSE42-NEXT: movdqa %xmm0, (%rcx) 1242; SSE42-NEXT: retq 1243; 1244; AVX-LABEL: vec128_v4i32_to_v2i64_factor2: 1245; AVX: # %bb.0: 1246; AVX-NEXT: vmovdqa (%rdi), %xmm0 1247; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1248; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1249; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1250; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1251; AVX-NEXT: retq 1252; 1253; AVX2-LABEL: vec128_v4i32_to_v2i64_factor2: 1254; AVX2: # %bb.0: 1255; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1256; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1257; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1258; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1259; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1260; AVX2-NEXT: vzeroupper 1261; AVX2-NEXT: retq 1262; 1263; AVX512F-LABEL: vec128_v4i32_to_v2i64_factor2: 1264; AVX512F: # %bb.0: 1265; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1266; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1267; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1268; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1269; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1270; AVX512F-NEXT: vzeroupper 1271; AVX512F-NEXT: retq 1272; 1273; AVX512BW-LABEL: vec128_v4i32_to_v2i64_factor2: 1274; AVX512BW: # %bb.0: 1275; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1276; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1277; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1278; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1279; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1280; AVX512BW-NEXT: vzeroupper 1281; AVX512BW-NEXT: retq 1282 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1283 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1284 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1285 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1286 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32> 1287 %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1288 %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8> 1289 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1290 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1291 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1292 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1293 ret void 1294} 1295 1296define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1297; SSE2-LABEL: vec128_v4i32_to_v1i128_factor4: 1298; SSE2: # %bb.0: 1299; SSE2-NEXT: movdqa (%rdi), %xmm0 1300; SSE2-NEXT: paddb (%rsi), %xmm0 1301; SSE2-NEXT: xorps %xmm1, %xmm1 1302; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1303; SSE2-NEXT: paddb (%rdx), %xmm1 1304; SSE2-NEXT: movdqa %xmm1, (%rcx) 1305; SSE2-NEXT: retq 1306; 1307; SSE42-LABEL: vec128_v4i32_to_v1i128_factor4: 1308; SSE42: # %bb.0: 1309; SSE42-NEXT: movdqa (%rdi), %xmm0 1310; SSE42-NEXT: paddb (%rsi), %xmm0 1311; SSE42-NEXT: pxor %xmm1, %xmm1 1312; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1313; SSE42-NEXT: paddb (%rdx), %xmm1 1314; SSE42-NEXT: movdqa %xmm1, (%rcx) 1315; SSE42-NEXT: retq 1316; 1317; AVX-LABEL: vec128_v4i32_to_v1i128_factor4: 1318; AVX: # %bb.0: 1319; AVX-NEXT: vmovdqa (%rdi), %xmm0 1320; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1321; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1322; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1323; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1324; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1325; AVX-NEXT: retq 1326; 1327; AVX2-LABEL: vec128_v4i32_to_v1i128_factor4: 1328; AVX2: # %bb.0: 1329; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1330; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1331; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1332; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1333; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1334; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1335; AVX2-NEXT: vzeroupper 1336; AVX2-NEXT: retq 1337; 1338; AVX512F-LABEL: vec128_v4i32_to_v1i128_factor4: 1339; AVX512F: # %bb.0: 1340; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1341; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1342; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1343; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1344; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1345; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1346; AVX512F-NEXT: vzeroupper 1347; AVX512F-NEXT: retq 1348; 1349; AVX512BW-LABEL: vec128_v4i32_to_v1i128_factor4: 1350; AVX512BW: # %bb.0: 1351; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1352; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1353; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1354; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1355; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1356; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1357; AVX512BW-NEXT: vzeroupper 1358; AVX512BW-NEXT: retq 1359 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1360 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1361 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1362 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1363 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32> 1364 %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1365 %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8> 1366 %out.bytevec.padded = shufflevector <16 x i8> 
%out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1367 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1368 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1369 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1370 ret void 1371} 1372 1373define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1374; SSE-LABEL: vec128_v2i64_to_v1i128_factor2: 1375; SSE: # %bb.0: 1376; SSE-NEXT: movdqa (%rdi), %xmm0 1377; SSE-NEXT: paddb (%rsi), %xmm0 1378; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 1379; SSE-NEXT: paddb (%rdx), %xmm0 1380; SSE-NEXT: movdqa %xmm0, (%rcx) 1381; SSE-NEXT: retq 1382; 1383; AVX-LABEL: vec128_v2i64_to_v1i128_factor2: 1384; AVX: # %bb.0: 1385; AVX-NEXT: vmovdqa (%rdi), %xmm0 1386; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1387; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1388; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1389; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1390; AVX-NEXT: retq 1391; 1392; AVX2-LABEL: vec128_v2i64_to_v1i128_factor2: 1393; AVX2: # %bb.0: 1394; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1395; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1396; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1397; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1398; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1399; AVX2-NEXT: vzeroupper 1400; AVX2-NEXT: retq 1401; 1402; AVX512F-LABEL: vec128_v2i64_to_v1i128_factor2: 1403; AVX512F: # %bb.0: 1404; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1405; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1406; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1407; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1408; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1409; AVX512F-NEXT: vzeroupper 1410; AVX512F-NEXT: retq 1411; 1412; AVX512BW-LABEL: vec128_v2i64_to_v1i128_factor2: 1413; AVX512BW: # %bb.0: 1414; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1415; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1416; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1417; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1418; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1419; AVX512BW-NEXT: vzeroupper 1420; AVX512BW-NEXT: retq 1421 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1422 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1423 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1424 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1425 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <2 x i64> 1426 %zextd.vec = shufflevector <2 x i64> %in.vec.cast, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> 1427 %out.bytevec = bitcast <2 x i64> %zextd.vec to <16 x i8> 1428 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, 
i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1429 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1430 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1431 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1432 ret void 1433} 1434 1435define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1436; SSE2-LABEL: vec256_v32i8_to_v16i16_factor2: 1437; SSE2: # %bb.0: 1438; SSE2-NEXT: movdqa (%rdi), %xmm0 1439; SSE2-NEXT: paddb (%rsi), %xmm0 1440; SSE2-NEXT: pxor %xmm1, %xmm1 1441; SSE2-NEXT: movdqa %xmm0, %xmm2 1442; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1443; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1444; SSE2-NEXT: paddb 16(%rdx), %xmm0 1445; SSE2-NEXT: paddb (%rdx), %xmm2 1446; SSE2-NEXT: movdqa %xmm2, (%rcx) 1447; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1448; SSE2-NEXT: retq 1449; 1450; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2: 1451; SSE42: # %bb.0: 1452; SSE42-NEXT: movdqa (%rdi), %xmm0 1453; SSE42-NEXT: paddb (%rsi), %xmm0 1454; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1455; SSE42-NEXT: pxor %xmm2, %xmm2 1456; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1457; SSE42-NEXT: paddb 16(%rdx), %xmm0 1458; SSE42-NEXT: paddb (%rdx), %xmm1 1459; SSE42-NEXT: movdqa %xmm1, (%rcx) 1460; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1461; SSE42-NEXT: retq 1462; 1463; AVX-LABEL: vec256_v32i8_to_v16i16_factor2: 1464; AVX: # %bb.0: 1465; AVX-NEXT: vmovdqa (%rdi), %xmm0 1466; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1467; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1468; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 1469; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1470; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1471; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1472; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1473; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1474; AVX-NEXT: retq 1475; 1476; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2: 1477; AVX2: # %bb.0: 1478; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1479; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1480; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1481; AVX2-NEXT: vpaddb (%rdx), %ymm0, 
%ymm0 1482; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1483; AVX2-NEXT: vzeroupper 1484; AVX2-NEXT: retq 1485; 1486; AVX512F-LABEL: vec256_v32i8_to_v16i16_factor2: 1487; AVX512F: # %bb.0: 1488; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1489; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1490; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1491; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1492; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1493; AVX512F-NEXT: vzeroupper 1494; AVX512F-NEXT: retq 1495; 1496; AVX512BW-LABEL: vec256_v32i8_to_v16i16_factor2: 1497; AVX512BW: # %bb.0: 1498; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1499; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1500; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1501; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1502; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1503; AVX512BW-NEXT: vzeroupper 1504; AVX512BW-NEXT: retq 1505 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1506 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1507 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1508 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1509 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 1, i32 35, i32 2, i32 37, i32 3, i32 39, i32 4, i32 41, i32 5, i32 43, i32 6, i32 45, i32 7, i32 47, i32 8, i32 49, i32 9, i32 51, i32 10, i32 53, i32 11, i32 55, i32 12, i32 57, i32 13, i32 59, i32 14, i32 61, i32 15, i32 63> 1510 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1511 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1512 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1513 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1514 ret void 1515} 1516 1517define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1518; SSE2-LABEL: vec256_v32i8_to_v8i32_factor4: 1519; SSE2: # %bb.0: 1520; SSE2-NEXT: movdqa (%rdi), %xmm0 1521; SSE2-NEXT: paddb (%rsi), %xmm0 1522; SSE2-NEXT: pxor %xmm1, %xmm1 1523; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1524; SSE2-NEXT: movdqa 
%xmm0, %xmm2 1525; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1526; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1527; SSE2-NEXT: paddb 16(%rdx), %xmm0 1528; SSE2-NEXT: paddb (%rdx), %xmm2 1529; SSE2-NEXT: movdqa %xmm2, (%rcx) 1530; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1531; SSE2-NEXT: retq 1532; 1533; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4: 1534; SSE42: # %bb.0: 1535; SSE42-NEXT: movdqa (%rdi), %xmm0 1536; SSE42-NEXT: paddb (%rsi), %xmm0 1537; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1538; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1539; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1540; SSE42-NEXT: paddb 16(%rdx), %xmm0 1541; SSE42-NEXT: paddb (%rdx), %xmm1 1542; SSE42-NEXT: movdqa %xmm1, (%rcx) 1543; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1544; SSE42-NEXT: retq 1545; 1546; AVX-LABEL: vec256_v32i8_to_v8i32_factor4: 1547; AVX: # %bb.0: 1548; AVX-NEXT: vmovdqa (%rdi), %xmm0 1549; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1550; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1551; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1552; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1553; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1554; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1555; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1556; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1557; AVX-NEXT: retq 1558; 1559; AVX2-LABEL: vec256_v32i8_to_v8i32_factor4: 1560; AVX2: # %bb.0: 1561; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1562; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1563; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1564; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1565; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1566; AVX2-NEXT: vzeroupper 1567; AVX2-NEXT: retq 1568; 1569; AVX512F-LABEL: vec256_v32i8_to_v8i32_factor4: 1570; AVX512F: # %bb.0: 1571; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1572; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1573; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1574; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1575; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1576; AVX512F-NEXT: vzeroupper 1577; AVX512F-NEXT: retq 1578; 1579; AVX512BW-LABEL: vec256_v32i8_to_v8i32_factor4: 1580; AVX512BW: # %bb.0: 1581; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1582; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1583; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1584; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1585; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1586; AVX512BW-NEXT: vzeroupper 1587; AVX512BW-NEXT: retq 1588 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1589 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1590 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1591 
%in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1592 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 3, i32 45, i32 46, i32 47, i32 4, i32 49, i32 50, i32 51, i32 5, i32 53, i32 54, i32 55, i32 6, i32 57, i32 58, i32 59, i32 7, i32 61, i32 62, i32 63> 1593 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1594 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1595 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1596 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1597 ret void 1598} 1599 1600define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1601; SSE2-LABEL: vec256_v32i8_to_v4i64_factor8: 1602; SSE2: # %bb.0: 1603; SSE2-NEXT: movdqa (%rdi), %xmm0 1604; SSE2-NEXT: paddb (%rsi), %xmm0 1605; SSE2-NEXT: pxor %xmm1, %xmm1 1606; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1607; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1608; SSE2-NEXT: movdqa %xmm0, %xmm2 1609; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1610; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1611; SSE2-NEXT: paddb 16(%rdx), %xmm0 1612; SSE2-NEXT: paddb (%rdx), %xmm2 1613; SSE2-NEXT: movdqa %xmm2, (%rcx) 1614; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1615; SSE2-NEXT: retq 1616; 1617; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8: 1618; SSE42: # %bb.0: 1619; SSE42-NEXT: movdqa (%rdi), %xmm0 1620; SSE42-NEXT: paddb (%rsi), %xmm0 1621; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1622; SSE42-NEXT: psrld $16, %xmm0 1623; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1624; SSE42-NEXT: paddb 16(%rdx), %xmm0 1625; SSE42-NEXT: paddb (%rdx), %xmm1 1626; SSE42-NEXT: movdqa %xmm1, (%rcx) 1627; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1628; SSE42-NEXT: retq 1629; 1630; AVX-LABEL: vec256_v32i8_to_v4i64_factor8: 1631; AVX: # %bb.0: 1632; AVX-NEXT: vmovdqa (%rdi), %xmm0 1633; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1634; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1635; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 1636; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1637; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1638; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1639; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1640; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1641; AVX-NEXT: retq 1642; 1643; AVX2-LABEL: vec256_v32i8_to_v4i64_factor8: 1644; AVX2: # %bb.0: 1645; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1646; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1647; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 1648; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1649; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1650; AVX2-NEXT: vzeroupper 1651; AVX2-NEXT: retq 1652; 1653; AVX512F-LABEL: vec256_v32i8_to_v4i64_factor8: 1654; AVX512F: # %bb.0: 1655; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1656; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1657; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 1658; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1659; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1660; AVX512F-NEXT: vzeroupper 1661; AVX512F-NEXT: retq 1662; 1663; AVX512BW-LABEL: vec256_v32i8_to_v4i64_factor8: 1664; AVX512BW: # %bb.0: 1665; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1666; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1667; AVX512BW-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 1668; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1669; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1670; AVX512BW-NEXT: vzeroupper 1671; AVX512BW-NEXT: retq 1672 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1673 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1674 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1675 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1676 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 1, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 2, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 3, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1677 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1678 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1679 %out.vec = add <64 x i8> %out.bytevec.padded, 
%out.vec.bias 1680 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1681 ret void 1682} 1683 1684define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1685; SSE2-LABEL: vec256_v32i8_to_v2i128_factor16: 1686; SSE2: # %bb.0: 1687; SSE2-NEXT: movdqa (%rdi), %xmm0 1688; SSE2-NEXT: paddb (%rsi), %xmm0 1689; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] 1690; SSE2-NEXT: pand %xmm0, %xmm1 1691; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 1692; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1693; SSE2-NEXT: paddb 16(%rdx), %xmm0 1694; SSE2-NEXT: paddb (%rdx), %xmm1 1695; SSE2-NEXT: movdqa %xmm1, (%rcx) 1696; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1697; SSE2-NEXT: retq 1698; 1699; SSE42-LABEL: vec256_v32i8_to_v2i128_factor16: 1700; SSE42: # %bb.0: 1701; SSE42-NEXT: movdqa (%rdi), %xmm0 1702; SSE42-NEXT: paddb (%rsi), %xmm0 1703; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] 1704; SSE42-NEXT: pand %xmm0, %xmm1 1705; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 1706; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1707; SSE42-NEXT: paddb 16(%rdx), %xmm0 1708; SSE42-NEXT: paddb (%rdx), %xmm1 1709; SSE42-NEXT: movdqa %xmm1, (%rcx) 1710; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1711; SSE42-NEXT: retq 1712; 1713; AVX-LABEL: vec256_v32i8_to_v2i128_factor16: 1714; AVX: # %bb.0: 1715; AVX-NEXT: vmovdqa (%rdi), %xmm0 1716; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1717; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1718; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 1719; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1720; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1721; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1722; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1723; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1724; AVX-NEXT: retq 1725; 1726; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16: 1727; AVX2: # %bb.0: 1728; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1729; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1730; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1731; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 1732; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1733; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1734; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1735; AVX2-NEXT: vzeroupper 1736; AVX2-NEXT: retq 1737; 1738; AVX512F-LABEL: vec256_v32i8_to_v2i128_factor16: 1739; AVX512F: # %bb.0: 1740; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1741; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1742; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1743; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 1744; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1745; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1746; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1747; AVX512F-NEXT: vzeroupper 1748; AVX512F-NEXT: retq 1749; 1750; AVX512BW-LABEL: vec256_v32i8_to_v2i128_factor16: 1751; AVX512BW: # %bb.0: 1752; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1753; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1754; AVX512BW-NEXT: 
vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1755; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 1756; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1757; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1758; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1759; AVX512BW-NEXT: vzeroupper 1760; AVX512BW-NEXT: retq 1761 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1762 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1763 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1764 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1765 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 1, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1766 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1767 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1768 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1769 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1770 ret void 1771} 1772 1773define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1774; SSE-LABEL: vec256_v32i8_to_v1i256_factor32: 1775; SSE: # %bb.0: 1776; SSE-NEXT: movdqa (%rdi), %xmm0 1777; SSE-NEXT: paddb (%rsi), %xmm0 1778; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1779; SSE-NEXT: movaps 16(%rdx), %xmm1 1780; SSE-NEXT: paddb (%rdx), %xmm0 1781; SSE-NEXT: movaps %xmm1, 16(%rcx) 1782; SSE-NEXT: movdqa %xmm0, (%rcx) 1783; SSE-NEXT: retq 1784; 1785; AVX-LABEL: vec256_v32i8_to_v1i256_factor32: 1786; AVX: # %bb.0: 1787; AVX-NEXT: vmovdqa (%rdi), %xmm0 1788; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1789; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1790; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1791; AVX-NEXT: vmovaps 16(%rdx), %xmm1 1792; AVX-NEXT: vmovaps %xmm1, 16(%rcx) 1793; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1794; AVX-NEXT: retq 1795; 1796; AVX2-LABEL: vec256_v32i8_to_v1i256_factor32: 1797; AVX2: # %bb.0: 1798; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1799; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1800; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 1801; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1802; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1803; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1804; AVX2-NEXT: vzeroupper 1805; AVX2-NEXT: retq 1806; 1807; AVX512F-LABEL: vec256_v32i8_to_v1i256_factor32: 1808; 
AVX512F: # %bb.0: 1809; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1810; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1811; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 1812; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 1813; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1814; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1815; AVX512F-NEXT: vzeroupper 1816; AVX512F-NEXT: retq 1817; 1818; AVX512BW-LABEL: vec256_v32i8_to_v1i256_factor32: 1819; AVX512BW: # %bb.0: 1820; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1821; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1822; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 1823; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 1824; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1825; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1826; AVX512BW-NEXT: vzeroupper 1827; AVX512BW-NEXT: retq 1828 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1829 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1830 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1831 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1832 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1833 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1834 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1835 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1836 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1837 ret void 1838} 1839 1840define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1841; SSE2-LABEL: vec256_v16i16_to_v8i32_factor2: 1842; SSE2: # %bb.0: 1843; SSE2-NEXT: movdqa (%rdi), %xmm0 1844; SSE2-NEXT: paddb (%rsi), %xmm0 1845; SSE2-NEXT: pxor %xmm1, %xmm1 1846; SSE2-NEXT: movdqa %xmm0, %xmm2 1847; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1848; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1849; SSE2-NEXT: paddb 16(%rdx), %xmm0 1850; SSE2-NEXT: paddb (%rdx), %xmm2 1851; SSE2-NEXT: movdqa %xmm2, (%rcx) 1852; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1853; SSE2-NEXT: retq 1854; 1855; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2: 1856; SSE42: # %bb.0: 1857; SSE42-NEXT: movdqa (%rdi), %xmm0 1858; SSE42-NEXT: paddb (%rsi), %xmm0 1859; SSE42-NEXT: pxor %xmm1, %xmm1 1860; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1861; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1862; SSE42-NEXT: paddb 16(%rdx), %xmm0 1863; SSE42-NEXT: paddb (%rdx), %xmm2 1864; SSE42-NEXT: movdqa %xmm2, (%rcx) 1865; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1866; SSE42-NEXT: retq 1867; 1868; AVX-LABEL: vec256_v16i16_to_v8i32_factor2: 1869; AVX: # %bb.0: 1870; AVX-NEXT: vmovdqa (%rdi), %xmm0 1871; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1872; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1873; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 1874; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1875; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1876; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1877; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1878; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1879; AVX-NEXT: retq 1880; 1881; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2: 1882; AVX2: # %bb.0: 1883; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1884; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1885; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1886; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1887; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1888; AVX2-NEXT: vzeroupper 1889; AVX2-NEXT: retq 1890; 1891; AVX512F-LABEL: vec256_v16i16_to_v8i32_factor2: 1892; AVX512F: # %bb.0: 1893; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1894; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1895; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1896; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1897; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1898; AVX512F-NEXT: vzeroupper 1899; AVX512F-NEXT: retq 1900; 1901; AVX512BW-LABEL: vec256_v16i16_to_v8i32_factor2: 1902; AVX512BW: # %bb.0: 1903; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1904; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1905; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1906; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1907; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1908; AVX512BW-NEXT: vzeroupper 1909; AVX512BW-NEXT: retq 1910 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1911 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1912 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1913 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1914 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> 1915 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31> 1916 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> 1917 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1918 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1919 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1920 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1921 ret void 1922} 1923 1924define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1925; SSE2-LABEL: vec256_v16i16_to_v4i64_factor4: 1926; SSE2: # %bb.0: 1927; SSE2-NEXT: movdqa (%rdi), %xmm0 1928; SSE2-NEXT: paddb (%rsi), %xmm0 1929; SSE2-NEXT: pxor %xmm1, %xmm1 1930; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1931; SSE2-NEXT: movdqa %xmm0, %xmm2 1932; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1933; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1934; SSE2-NEXT: paddb 16(%rdx), %xmm0 1935; SSE2-NEXT: paddb (%rdx), %xmm2 1936; SSE2-NEXT: movdqa %xmm2, (%rcx) 1937; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1938; SSE2-NEXT: retq 1939; 1940; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4: 1941; SSE42: # %bb.0: 1942; SSE42-NEXT: movdqa (%rdi), %xmm0 1943; SSE42-NEXT: paddb (%rsi), %xmm0 1944; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1945; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1946; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1947; SSE42-NEXT: paddb 16(%rdx), %xmm0 1948; SSE42-NEXT: paddb (%rdx), %xmm1 1949; SSE42-NEXT: movdqa %xmm1, (%rcx) 1950; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1951; SSE42-NEXT: retq 1952; 1953; AVX-LABEL: vec256_v16i16_to_v4i64_factor4: 1954; AVX: # %bb.0: 1955; AVX-NEXT: vmovdqa (%rdi), %xmm0 1956; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1957; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1958; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1959; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1960; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1961; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1962; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1963; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1964; AVX-NEXT: retq 1965; 1966; AVX2-LABEL: vec256_v16i16_to_v4i64_factor4: 1967; AVX2: # %bb.0: 1968; AVX2-NEXT: vmovdqa (%rdi), %xmm0 1969; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1970; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1971; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1972; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1973; AVX2-NEXT: vzeroupper 1974; AVX2-NEXT: retq 1975; 1976; AVX512F-LABEL: vec256_v16i16_to_v4i64_factor4: 1977; AVX512F: # %bb.0: 1978; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 1979; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1980; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1981; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1982; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1983; AVX512F-NEXT: vzeroupper 1984; AVX512F-NEXT: retq 1985; 1986; AVX512BW-LABEL: vec256_v16i16_to_v4i64_factor4: 1987; AVX512BW: # %bb.0: 1988; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1989; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1990; 
AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1991; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1992; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1993; AVX512BW-NEXT: vzeroupper 1994; AVX512BW-NEXT: retq 1995 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1996 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1997 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1998 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1999 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> 2000 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31> 2001 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> 2002 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2003 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2004 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2005 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2006 ret void 2007} 2008 2009define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2010; SSE2-LABEL: vec256_v16i16_to_v2i128_factor8: 2011; SSE2: # %bb.0: 2012; SSE2-NEXT: movdqa (%rdi), %xmm0 2013; SSE2-NEXT: paddb (%rsi), %xmm0 2014; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] 2015; SSE2-NEXT: pand %xmm0, %xmm1 2016; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2017; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2018; SSE2-NEXT: paddb 16(%rdx), %xmm0 2019; SSE2-NEXT: paddb (%rdx), %xmm1 2020; SSE2-NEXT: movdqa %xmm1, (%rcx) 2021; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2022; SSE2-NEXT: retq 2023; 2024; SSE42-LABEL: vec256_v16i16_to_v2i128_factor8: 2025; SSE42: # %bb.0: 2026; SSE42-NEXT: movdqa (%rdi), %xmm0 2027; SSE42-NEXT: paddb (%rsi), %xmm0 2028; SSE42-NEXT: pxor %xmm1, %xmm1 2029; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 2030; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2031; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2032; SSE42-NEXT: paddb 16(%rdx), %xmm0 2033; SSE42-NEXT: paddb (%rdx), %xmm1 2034; SSE42-NEXT: movdqa %xmm1, (%rcx) 2035; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2036; SSE42-NEXT: retq 2037; 2038; AVX-LABEL: vec256_v16i16_to_v2i128_factor8: 2039; AVX: # %bb.0: 2040; AVX-NEXT: vmovdqa (%rdi), %xmm0 2041; AVX-NEXT: vpaddb 
(%rsi), %xmm0, %xmm0 2042; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2043; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 2044; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2045; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2046; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2047; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2048; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2049; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2050; AVX-NEXT: retq 2051; 2052; AVX2-LABEL: vec256_v16i16_to_v2i128_factor8: 2053; AVX2: # %bb.0: 2054; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2055; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2056; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2057; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 2058; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2059; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 2060; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2061; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2062; AVX2-NEXT: vzeroupper 2063; AVX2-NEXT: retq 2064; 2065; AVX512F-LABEL: vec256_v16i16_to_v2i128_factor8: 2066; AVX512F: # %bb.0: 2067; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2068; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2069; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2070; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 2071; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2072; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 2073; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2074; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2075; AVX512F-NEXT: vzeroupper 2076; AVX512F-NEXT: retq 2077; 2078; AVX512BW-LABEL: vec256_v16i16_to_v2i128_factor8: 2079; AVX512BW: # %bb.0: 2080; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2081; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2082; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] 2083; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2084; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 2085; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 2086; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2087; AVX512BW-NEXT: vzeroupper 2088; AVX512BW-NEXT: retq 2089 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2090 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2091 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2092 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2093 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> 2094 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2095 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> 2096 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2097 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2098 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2099 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2100 ret void 2101} 2102 2103define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2104; SSE2-LABEL: vec256_v16i16_to_v1i256_factor16: 2105; SSE2: # %bb.0: 2106; SSE2-NEXT: movdqa (%rdi), %xmm0 2107; SSE2-NEXT: paddb (%rsi), %xmm0 2108; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2109; SSE2-NEXT: movaps 16(%rdx), %xmm1 2110; SSE2-NEXT: paddb (%rdx), %xmm0 2111; SSE2-NEXT: movaps %xmm1, 16(%rcx) 2112; SSE2-NEXT: movdqa %xmm0, (%rcx) 2113; SSE2-NEXT: retq 2114; 2115; SSE42-LABEL: vec256_v16i16_to_v1i256_factor16: 2116; SSE42: # %bb.0: 2117; SSE42-NEXT: movdqa (%rdi), %xmm0 2118; SSE42-NEXT: paddb (%rsi), %xmm0 2119; SSE42-NEXT: pxor %xmm1, %xmm1 2120; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 2121; SSE42-NEXT: movaps 16(%rdx), %xmm0 2122; SSE42-NEXT: paddb (%rdx), %xmm1 2123; SSE42-NEXT: movaps %xmm0, 16(%rcx) 2124; SSE42-NEXT: movdqa %xmm1, (%rcx) 2125; SSE42-NEXT: retq 2126; 2127; AVX-LABEL: vec256_v16i16_to_v1i256_factor16: 2128; AVX: # %bb.0: 2129; AVX-NEXT: vmovdqa (%rdi), %xmm0 2130; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2131; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2132; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 2133; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2134; AVX-NEXT: vmovaps 16(%rdx), %xmm1 2135; AVX-NEXT: vmovaps %xmm1, 16(%rcx) 2136; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2137; AVX-NEXT: retq 2138; 2139; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16: 2140; AVX2: # %bb.0: 2141; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2142; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2143; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 2144; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2145; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2146; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2147; AVX2-NEXT: vzeroupper 2148; AVX2-NEXT: retq 2149; 2150; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16: 2151; AVX512F: # %bb.0: 2152; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2153; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2154; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 2155; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 2156; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2157; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2158; AVX512F-NEXT: vzeroupper 2159; AVX512F-NEXT: retq 2160; 2161; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16: 2162; AVX512BW: # %bb.0: 2163; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2164; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2165; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 2166; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 2167; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2168; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2169; AVX512BW-NEXT: vzeroupper 2170; AVX512BW-NEXT: retq 2171 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2172 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2173 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2174 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, 
i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2175 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> 2176 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2177 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> 2178 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2179 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2180 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2181 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2182 ret void 2183} 2184 2185define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2186; SSE2-LABEL: vec256_v8i32_to_v4i64_factor2: 2187; SSE2: # %bb.0: 2188; SSE2-NEXT: movdqa (%rdi), %xmm0 2189; SSE2-NEXT: paddb (%rsi), %xmm0 2190; SSE2-NEXT: pxor %xmm1, %xmm1 2191; SSE2-NEXT: movdqa %xmm0, %xmm2 2192; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2193; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2194; SSE2-NEXT: paddb 16(%rdx), %xmm0 2195; SSE2-NEXT: paddb (%rdx), %xmm2 2196; SSE2-NEXT: movdqa %xmm2, (%rcx) 2197; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2198; SSE2-NEXT: retq 2199; 2200; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2: 2201; SSE42: # %bb.0: 2202; SSE42-NEXT: movdqa (%rdi), %xmm0 2203; SSE42-NEXT: paddb (%rsi), %xmm0 2204; SSE42-NEXT: pxor %xmm1, %xmm1 2205; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 2206; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2207; SSE42-NEXT: paddb 16(%rdx), %xmm0 2208; SSE42-NEXT: paddb (%rdx), %xmm2 2209; SSE42-NEXT: movdqa %xmm2, (%rcx) 2210; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2211; SSE42-NEXT: retq 2212; 2213; AVX-LABEL: vec256_v8i32_to_v4i64_factor2: 2214; AVX: # %bb.0: 2215; AVX-NEXT: vmovdqa (%rdi), %xmm0 2216; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2217; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero 2218; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 2219; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2220; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2221; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2222; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2223; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2224; AVX-NEXT: retq 2225; 2226; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2: 2227; AVX2: # %bb.0: 2228; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2229; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2230; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2231; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2232; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2233; AVX2-NEXT: vzeroupper 2234; AVX2-NEXT: retq 2235; 2236; AVX512F-LABEL: vec256_v8i32_to_v4i64_factor2: 2237; AVX512F: 
# %bb.0: 2238; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2239; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2240; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2241; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2242; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2243; AVX512F-NEXT: vzeroupper 2244; AVX512F-NEXT: retq 2245; 2246; AVX512BW-LABEL: vec256_v8i32_to_v4i64_factor2: 2247; AVX512BW: # %bb.0: 2248; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 2249; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2250; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2251; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2252; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2253; AVX512BW-NEXT: vzeroupper 2254; AVX512BW-NEXT: retq 2255 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2256 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2257 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2258 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2259 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> 2260 %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> 2261 %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> 2262 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2263 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2264 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2265 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2266 ret void 2267} 2268 2269define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2270; SSE2-LABEL: vec256_v8i32_to_v2i128_factor4: 2271; SSE2: # %bb.0: 2272; SSE2-NEXT: movdqa (%rdi), %xmm0 2273; SSE2-NEXT: paddb (%rsi), %xmm0 2274; SSE2-NEXT: xorps %xmm1, %xmm1 2275; SSE2-NEXT: xorps %xmm2, %xmm2 2276; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 2277; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] 2278; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 2279; SSE2-NEXT: paddb 16(%rdx), %xmm0 2280; SSE2-NEXT: paddb (%rdx), %xmm2 2281; SSE2-NEXT: movdqa %xmm2, (%rcx) 2282; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2283; SSE2-NEXT: retq 2284; 2285; SSE42-LABEL: vec256_v8i32_to_v2i128_factor4: 2286; SSE42: # %bb.0: 2287; SSE42-NEXT: movdqa (%rdi), %xmm0 2288; SSE42-NEXT: paddb (%rsi), %xmm0 2289; SSE42-NEXT: pxor %xmm1, %xmm1 2290; SSE42-NEXT: pxor %xmm2, %xmm2 2291; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] 2292; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 
2293; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2294; SSE42-NEXT: paddb 16(%rdx), %xmm0 2295; SSE42-NEXT: paddb (%rdx), %xmm2 2296; SSE42-NEXT: movdqa %xmm2, (%rcx) 2297; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2298; SSE42-NEXT: retq 2299; 2300; AVX-LABEL: vec256_v8i32_to_v2i128_factor4: 2301; AVX: # %bb.0: 2302; AVX-NEXT: vmovdqa (%rdi), %xmm0 2303; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2304; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 2305; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2306; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2307; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2308; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 2309; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 2310; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2311; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2312; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 2313; AVX-NEXT: vzeroupper 2314; AVX-NEXT: retq 2315; 2316; AVX2-SLOW-LABEL: vec256_v8i32_to_v2i128_factor4: 2317; AVX2-SLOW: # %bb.0: 2318; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 2319; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2320; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 2321; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 2322; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2323; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2324; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2325; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2326; AVX2-SLOW-NEXT: vzeroupper 2327; AVX2-SLOW-NEXT: retq 2328; 2329; AVX2-FAST-PERLANE-LABEL: vec256_v8i32_to_v2i128_factor4: 2330; AVX2-FAST-PERLANE: # %bb.0: 2331; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 2332; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2333; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 2334; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 2335; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 2336; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2337; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2338; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 2339; AVX2-FAST-PERLANE-NEXT: vzeroupper 2340; AVX2-FAST-PERLANE-NEXT: retq 2341; 2342; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4: 2343; AVX2-FAST: # %bb.0: 2344; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 2345; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2346; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0] 2347; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2348; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 2349; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2350; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2351; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2352; AVX2-FAST-NEXT: vzeroupper 2353; AVX2-FAST-NEXT: retq 2354; 2355; AVX512F-LABEL: vec256_v8i32_to_v2i128_factor4: 2356; AVX512F: # %bb.0: 2357; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2358; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2359; AVX512F-NEXT: movb $17, %al 2360; AVX512F-NEXT: kmovw %eax, %k1 2361; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} 2362; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2363; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2364; AVX512F-NEXT: vzeroupper 2365; AVX512F-NEXT: retq 2366; 2367; AVX512BW-LABEL: vec256_v8i32_to_v2i128_factor4: 2368; AVX512BW: # %bb.0: 2369; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2370; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2371; AVX512BW-NEXT: movb $17, %al 2372; AVX512BW-NEXT: kmovd %eax, %k1 2373; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} 2374; AVX512BW-NEXT: vpaddb 
(%rdx), %zmm0, %zmm0 2375; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2376; AVX512BW-NEXT: vzeroupper 2377; AVX512BW-NEXT: retq 2378 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2379 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2380 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2381 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2382 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> 2383 %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> 2384 %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> 2385 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2386 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2387 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2388 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2389 ret void 2390} 2391 2392define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2393; SSE2-LABEL: vec256_v8i32_to_v1i256_factor8: 2394; SSE2: # %bb.0: 2395; SSE2-NEXT: movdqa (%rdi), %xmm0 2396; SSE2-NEXT: paddb (%rsi), %xmm0 2397; SSE2-NEXT: xorps %xmm1, %xmm1 2398; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2399; SSE2-NEXT: movaps 16(%rdx), %xmm0 2400; SSE2-NEXT: paddb (%rdx), %xmm1 2401; SSE2-NEXT: movaps %xmm0, 16(%rcx) 2402; SSE2-NEXT: movdqa %xmm1, (%rcx) 2403; SSE2-NEXT: retq 2404; 2405; SSE42-LABEL: vec256_v8i32_to_v1i256_factor8: 2406; SSE42: # %bb.0: 2407; SSE42-NEXT: movdqa (%rdi), %xmm0 2408; SSE42-NEXT: paddb (%rsi), %xmm0 2409; SSE42-NEXT: pxor %xmm1, %xmm1 2410; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2411; SSE42-NEXT: movaps 16(%rdx), %xmm0 2412; SSE42-NEXT: paddb (%rdx), %xmm1 2413; SSE42-NEXT: movaps %xmm0, 16(%rcx) 2414; SSE42-NEXT: movdqa %xmm1, (%rcx) 2415; SSE42-NEXT: retq 2416; 2417; AVX-LABEL: vec256_v8i32_to_v1i256_factor8: 2418; AVX: # %bb.0: 2419; AVX-NEXT: vmovdqa (%rdi), %xmm0 2420; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2421; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 2422; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2423; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2424; AVX-NEXT: vmovaps 16(%rdx), %xmm1 2425; AVX-NEXT: vmovaps %xmm1, 16(%rcx) 2426; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2427; AVX-NEXT: retq 2428; 2429; AVX2-LABEL: vec256_v8i32_to_v1i256_factor8: 2430; AVX2: # %bb.0: 2431; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2432; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2433; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2434; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3,4,5,6,7] 2435; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2436; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2437; AVX2-NEXT: vzeroupper 2438; AVX2-NEXT: retq 2439; 2440; AVX512F-LABEL: vec256_v8i32_to_v1i256_factor8: 2441; AVX512F: # %bb.0: 2442; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2443; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2444; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2445; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2446; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2447; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2448; AVX512F-NEXT: vzeroupper 2449; AVX512F-NEXT: retq 2450; 2451; AVX512BW-LABEL: vec256_v8i32_to_v1i256_factor8: 2452; AVX512BW: # %bb.0: 2453; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2454; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2455; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2456; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2457; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2458; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2459; AVX512BW-NEXT: vzeroupper 2460; AVX512BW-NEXT: retq 2461 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2462 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2463 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2464 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2465 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> 2466 %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2467 %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> 2468 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2469 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2470 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2471 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2472 ret void 2473} 2474 2475define void @vec256_v4i64_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2476; SSE-LABEL: vec256_v4i64_to_v2i128_factor2: 2477; SSE: # %bb.0: 2478; SSE-NEXT: movdqa (%rdi), %xmm0 2479; SSE-NEXT: paddb (%rsi), %xmm0 2480; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 2481; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 2482; SSE-NEXT: paddb 16(%rdx), %xmm0 2483; SSE-NEXT: paddb (%rdx), %xmm1 2484; SSE-NEXT: movdqa %xmm1, (%rcx) 2485; SSE-NEXT: movdqa %xmm0, 16(%rcx) 2486; SSE-NEXT: retq 2487; 2488; AVX-LABEL: vec256_v4i64_to_v2i128_factor2: 2489; AVX: # %bb.0: 2490; AVX-NEXT: vmovdqa (%rdi), %xmm0 2491; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2492; AVX-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 2493; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 2494; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] 2495; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 2496; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 2497; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2498; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2499; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 2500; AVX-NEXT: vzeroupper 2501; AVX-NEXT: retq 2502; 2503; AVX2-LABEL: vec256_v4i64_to_v2i128_factor2: 2504; AVX2: # %bb.0: 2505; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2506; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2507; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 2508; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2509; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 2510; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2511; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2512; AVX2-NEXT: vzeroupper 2513; AVX2-NEXT: retq 2514; 2515; AVX512F-LABEL: vec256_v4i64_to_v2i128_factor2: 2516; AVX512F: # %bb.0: 2517; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2518; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2519; AVX512F-NEXT: movb $5, %al 2520; AVX512F-NEXT: kmovw %eax, %k1 2521; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} 2522; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2523; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2524; AVX512F-NEXT: vzeroupper 2525; AVX512F-NEXT: retq 2526; 2527; AVX512BW-LABEL: vec256_v4i64_to_v2i128_factor2: 2528; AVX512BW: # %bb.0: 2529; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2530; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2531; AVX512BW-NEXT: movb $5, %al 2532; AVX512BW-NEXT: kmovd %eax, %k1 2533; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} 2534; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2535; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2536; AVX512BW-NEXT: vzeroupper 2537; AVX512BW-NEXT: retq 2538 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2539 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2540 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2541 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2542 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64> 2543 %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2544 %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8> 2545 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2546 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2547 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2548 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2549 ret void 2550} 2551 2552define void @vec256_v4i64_to_v1i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, 
ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2553; SSE-LABEL: vec256_v4i64_to_v1i256_factor4: 2554; SSE: # %bb.0: 2555; SSE-NEXT: movdqa (%rdi), %xmm0 2556; SSE-NEXT: paddb (%rsi), %xmm0 2557; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 2558; SSE-NEXT: movaps 16(%rdx), %xmm1 2559; SSE-NEXT: paddb (%rdx), %xmm0 2560; SSE-NEXT: movaps %xmm1, 16(%rcx) 2561; SSE-NEXT: movdqa %xmm0, (%rcx) 2562; SSE-NEXT: retq 2563; 2564; AVX-LABEL: vec256_v4i64_to_v1i256_factor4: 2565; AVX: # %bb.0: 2566; AVX-NEXT: vmovdqa (%rdi), %xmm0 2567; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2568; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2569; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2570; AVX-NEXT: vmovaps 16(%rdx), %xmm1 2571; AVX-NEXT: vmovaps %xmm1, 16(%rcx) 2572; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2573; AVX-NEXT: retq 2574; 2575; AVX2-LABEL: vec256_v4i64_to_v1i256_factor4: 2576; AVX2: # %bb.0: 2577; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2578; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2579; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2580; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2581; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2582; AVX2-NEXT: vzeroupper 2583; AVX2-NEXT: retq 2584; 2585; AVX512F-LABEL: vec256_v4i64_to_v1i256_factor4: 2586; AVX512F: # %bb.0: 2587; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2588; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2589; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2590; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2591; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2592; AVX512F-NEXT: vzeroupper 2593; AVX512F-NEXT: retq 2594; 2595; AVX512BW-LABEL: vec256_v4i64_to_v1i256_factor4: 2596; AVX512BW: # %bb.0: 2597; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2598; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2599; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2600; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2601; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2602; AVX512BW-NEXT: vzeroupper 2603; AVX512BW-NEXT: retq 2604 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2605 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2606 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2607 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2608 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64> 2609 %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2610 %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8> 2611 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2612 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2613 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2614 store <64 x i8> %out.vec, ptr 
%out.vec.ptr, align 64 2615 ret void 2616} 2617 2618define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2619; SSE-LABEL: vec256_v2i128_to_v1i256_factor2: 2620; SSE: # %bb.0: 2621; SSE-NEXT: movdqa (%rdi), %xmm0 2622; SSE-NEXT: paddb (%rsi), %xmm0 2623; SSE-NEXT: movaps 16(%rdx), %xmm1 2624; SSE-NEXT: paddb (%rdx), %xmm0 2625; SSE-NEXT: movaps %xmm1, 16(%rcx) 2626; SSE-NEXT: movdqa %xmm0, (%rcx) 2627; SSE-NEXT: retq 2628; 2629; AVX-LABEL: vec256_v2i128_to_v1i256_factor2: 2630; AVX: # %bb.0: 2631; AVX-NEXT: vmovdqa (%rdi), %xmm0 2632; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2633; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2634; AVX-NEXT: vmovaps 16(%rdx), %xmm1 2635; AVX-NEXT: vmovaps %xmm1, 16(%rcx) 2636; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2637; AVX-NEXT: retq 2638; 2639; AVX2-LABEL: vec256_v2i128_to_v1i256_factor2: 2640; AVX2: # %bb.0: 2641; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2642; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2643; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2644; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2645; AVX2-NEXT: vzeroupper 2646; AVX2-NEXT: retq 2647; 2648; AVX512F-LABEL: vec256_v2i128_to_v1i256_factor2: 2649; AVX512F: # %bb.0: 2650; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2651; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2652; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2653; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 2654; AVX512F-NEXT: vzeroupper 2655; AVX512F-NEXT: retq 2656; 2657; AVX512BW-LABEL: vec256_v2i128_to_v1i256_factor2: 2658; AVX512BW: # %bb.0: 2659; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 2660; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2661; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2662; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2663; AVX512BW-NEXT: vzeroupper 2664; AVX512BW-NEXT: retq 2665 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2666 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2667 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2668 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2669 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <2 x i128> 2670 %zextd.vec = shufflevector <2 x i128> %in.vec.cast, <2 x i128> zeroinitializer, <2 x i32> <i32 0, i32 3> 2671 %out.bytevec = bitcast <2 x i128> %zextd.vec to <32 x i8> 2672 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2673 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2674 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2675 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2676 ret void 2677} 2678 2679define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr 
%in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2680; SSE2-LABEL: vec384_v48i8_to_v24i16_factor2: 2681; SSE2: # %bb.0: 2682; SSE2-NEXT: movdqa (%rdi), %xmm0 2683; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2684; SSE2-NEXT: paddb (%rsi), %xmm0 2685; SSE2-NEXT: paddb 16(%rsi), %xmm1 2686; SSE2-NEXT: pxor %xmm2, %xmm2 2687; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2688; SSE2-NEXT: movdqa %xmm0, %xmm3 2689; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 2690; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2691; SSE2-NEXT: paddb 16(%rdx), %xmm0 2692; SSE2-NEXT: paddb (%rdx), %xmm3 2693; SSE2-NEXT: paddb 32(%rdx), %xmm1 2694; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 2695; SSE2-NEXT: movdqa %xmm3, (%rcx) 2696; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2697; SSE2-NEXT: retq 2698; 2699; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2: 2700; SSE42: # %bb.0: 2701; SSE42-NEXT: movdqa (%rdi), %xmm0 2702; SSE42-NEXT: movdqa 16(%rdi), %xmm1 2703; SSE42-NEXT: paddb (%rsi), %xmm0 2704; SSE42-NEXT: paddb 16(%rsi), %xmm1 2705; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2706; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2707; SSE42-NEXT: pxor %xmm3, %xmm3 2708; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 2709; SSE42-NEXT: paddb 16(%rdx), %xmm0 2710; SSE42-NEXT: paddb (%rdx), %xmm2 2711; SSE42-NEXT: paddb 32(%rdx), %xmm1 2712; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2713; SSE42-NEXT: movdqa %xmm2, (%rcx) 2714; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2715; SSE42-NEXT: retq 2716; 2717; AVX-LABEL: vec384_v48i8_to_v24i16_factor2: 2718; AVX: # %bb.0: 2719; AVX-NEXT: vmovdqa (%rdi), %xmm0 2720; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 2721; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 2722; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2723; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2724; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 2725; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 2726; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2727; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 2728; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2729; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 2730; AVX-NEXT: vmovdqa %xmm2, (%rcx) 2731; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2732; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 2733; AVX-NEXT: retq 2734; 2735; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2: 2736; AVX2: # %bb.0: 2737; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2738; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2739; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2740; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2741; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2742; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2743; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2744; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2745; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2746; AVX2-NEXT: vzeroupper 2747; AVX2-NEXT: retq 2748; 2749; AVX512F-LABEL: vec384_v48i8_to_v24i16_factor2: 2750; AVX512F: # %bb.0: 2751; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2752; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2753; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2754; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 2755; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2756; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2757; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2758; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2759; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2760; AVX512F-NEXT: vzeroupper 2761; AVX512F-NEXT: retq 2762; 2763; AVX512BW-LABEL: vec384_v48i8_to_v24i16_factor2: 2764; AVX512BW: # %bb.0: 2765; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2766; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2767; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 2768; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2769; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2770; AVX512BW-NEXT: vzeroupper 2771; AVX512BW-NEXT: retq 2772 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2773 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2774 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2775 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 2776 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 1, i32 51, i32 2, i32 53, i32 3, i32 55, i32 4, i32 57, i32 5, i32 59, i32 6, i32 61, i32 7, i32 63, i32 8, i32 65, i32 9, i32 67, i32 10, i32 69, i32 11, i32 71, i32 12, i32 73, i32 13, i32 75, i32 14, i32 77, i32 15, i32 79, i32 16, i32 81, i32 17, i32 83, i32 18, i32 85, i32 19, i32 87, i32 20, i32 89, i32 21, i32 91, i32 22, i32 93, i32 23, i32 95> 2777 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 
3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2778 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2779 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2780 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2781 ret void 2782} 2783 2784define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2785; SSE2-LABEL: vec384_v48i8_to_v16i24_factor3: 2786; SSE2: # %bb.0: 2787; SSE2-NEXT: movdqa (%rdi), %xmm0 2788; SSE2-NEXT: paddb (%rsi), %xmm0 2789; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] 2790; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] 2791; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2792; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2793; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] 2794; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 2795; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2796; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2797; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 2798; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6] 2799; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2800; SSE2-NEXT: paddb (%rdx), %xmm0 2801; SSE2-NEXT: paddb 32(%rdx), %xmm2 2802; SSE2-NEXT: paddb 16(%rdx), %xmm1 2803; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2804; SSE2-NEXT: movdqa %xmm2, 32(%rcx) 2805; SSE2-NEXT: movdqa %xmm0, (%rcx) 2806; SSE2-NEXT: retq 2807; 2808; SSE42-LABEL: vec384_v48i8_to_v16i24_factor3: 2809; SSE42: # %bb.0: 2810; SSE42-NEXT: movdqa (%rdi), %xmm0 2811; SSE42-NEXT: paddb (%rsi), %xmm0 2812; SSE42-NEXT: movdqa %xmm0, %xmm1 2813; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero 2814; SSE42-NEXT: movdqa %xmm0, %xmm2 2815; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5] 2816; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero 2817; SSE42-NEXT: paddb 16(%rdx), %xmm0 2818; SSE42-NEXT: paddb (%rdx), %xmm2 2819; SSE42-NEXT: paddb 32(%rdx), %xmm1 2820; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2821; SSE42-NEXT: movdqa %xmm2, (%rcx) 2822; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2823; SSE42-NEXT: retq 2824; 2825; AVX-LABEL: vec384_v48i8_to_v16i24_factor3: 2826; AVX: # %bb.0: 2827; AVX-NEXT: vmovdqa (%rdi), %xmm0 2828; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2829; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] 2830; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero 2831; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero 2832; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 2833; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 2834; AVX-NEXT: vpaddb 
(%rdx), %xmm1, %xmm1 2835; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2836; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 2837; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 2838; AVX-NEXT: retq 2839; 2840; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3: 2841; AVX2: # %bb.0: 2842; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2843; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2844; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] 2845; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero 2846; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero 2847; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2848; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2849; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2850; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2851; AVX2-NEXT: vzeroupper 2852; AVX2-NEXT: retq 2853; 2854; AVX512F-LABEL: vec384_v48i8_to_v16i24_factor3: 2855; AVX512F: # %bb.0: 2856; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2857; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2858; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] 2859; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero 2860; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero 2861; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2862; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2863; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2864; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2865; AVX512F-NEXT: vzeroupper 2866; AVX512F-NEXT: retq 2867; 2868; AVX512BW-LABEL: vec384_v48i8_to_v16i24_factor3: 2869; AVX512BW: # %bb.0: 2870; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2871; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2872; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2,0,3,3,0,4,4,0,5] 2873; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm1 2874; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2875; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero 2876; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 2877; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2878; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2879; AVX512BW-NEXT: vzeroupper 2880; AVX512BW-NEXT: retq 2881 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2882 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2883 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2884 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 2885 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 1, i32 52, i32 53, i32 2, i32 55, i32 56, i32 3, i32 58, i32 59, i32 4, i32 61, i32 62, i32 5, i32 64, i32 65, i32 6, i32 67, i32 68, i32 7, i32 70, i32 71, i32 8, i32 73, i32 74, i32 9, i32 76, i32 77, i32 10, i32 
79, i32 80, i32 11, i32 82, i32 83, i32 12, i32 85, i32 86, i32 13, i32 88, i32 89, i32 14, i32 91, i32 92, i32 15, i32 94, i32 95> 2886 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2887 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2888 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2889 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2890 ret void 2891} 2892 2893define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2894; SSE2-LABEL: vec384_v48i8_to_v12i32_factor4: 2895; SSE2: # %bb.0: 2896; SSE2-NEXT: movdqa (%rdi), %xmm0 2897; SSE2-NEXT: paddb (%rsi), %xmm0 2898; SSE2-NEXT: pxor %xmm1, %xmm1 2899; SSE2-NEXT: movdqa %xmm0, %xmm2 2900; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2901; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2902; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2903; SSE2-NEXT: movdqa %xmm0, %xmm3 2904; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 2905; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2906; SSE2-NEXT: paddb 16(%rdx), %xmm0 2907; SSE2-NEXT: paddb (%rdx), %xmm3 2908; SSE2-NEXT: paddb 32(%rdx), %xmm2 2909; SSE2-NEXT: movdqa %xmm2, 32(%rcx) 2910; SSE2-NEXT: movdqa %xmm3, (%rcx) 2911; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2912; SSE2-NEXT: retq 2913; 2914; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4: 2915; SSE42: # %bb.0: 2916; SSE42-NEXT: movdqa (%rdi), %xmm0 2917; SSE42-NEXT: paddb (%rsi), %xmm0 2918; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2919; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2920; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 2921; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 2922; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2923; SSE42-NEXT: paddb 16(%rdx), %xmm0 2924; SSE42-NEXT: paddb 32(%rdx), %xmm2 2925; SSE42-NEXT: paddb (%rdx), %xmm1 2926; SSE42-NEXT: movdqa %xmm1, (%rcx) 2927; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 2928; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2929; SSE42-NEXT: retq 2930; 2931; AVX-LABEL: vec384_v48i8_to_v12i32_factor4: 2932; AVX: # %bb.0: 2933; AVX-NEXT: vmovdqa (%rdi), %xmm0 2934; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2935; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2936; 
AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 2937; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 2938; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 2939; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2940; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 2941; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 2942; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2943; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2944; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 2945; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 2946; AVX-NEXT: retq 2947; 2948; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4: 2949; AVX2-SLOW: # %bb.0: 2950; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 2951; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2952; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2953; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 2954; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2955; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2956; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2957; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 2958; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 2959; AVX2-SLOW-NEXT: vzeroupper 2960; AVX2-SLOW-NEXT: retq 2961; 2962; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4: 2963; AVX2-FAST-PERLANE: # %bb.0: 2964; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 2965; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2966; AVX2-FAST-PERLANE-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2967; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero 2968; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2969; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2970; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 2971; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 2972; AVX2-FAST-PERLANE-NEXT: vzeroupper 2973; AVX2-FAST-PERLANE-NEXT: retq 2974; 2975; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4: 2976; AVX2-FAST: # %bb.0: 2977; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 2978; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2979; AVX2-FAST-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2980; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero 2981; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2982; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2983; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 2984; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 2985; AVX2-FAST-NEXT: vzeroupper 2986; AVX2-FAST-NEXT: retq 2987; 2988; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4: 2989; AVX512F: # %bb.0: 2990; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2991; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2992; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2993; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero 2994; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2995; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2996; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2997; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2998; AVX512F-NEXT: vzeroupper 2999; AVX512F-NEXT: retq 3000; 3001; AVX512BW-LABEL: vec384_v48i8_to_v12i32_factor4: 3002; AVX512BW: # %bb.0: 3003; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 3004; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3005; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 3006; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3007; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3008; AVX512BW-NEXT: vzeroupper 3009; AVX512BW-NEXT: retq 3010 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3011 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3012 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3013 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3014 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 1, i32 53, i32 54, i32 55, i32 2, i32 57, i32 58, i32 59, i32 3, i32 61, i32 62, i32 63, i32 4, i32 65, i32 66, i32 67, i32 5, i32 69, i32 70, i32 71, i32 6, i32 73, i32 74, i32 75, i32 7, i32 77, i32 78, i32 79, i32 8, i32 81, i32 82, i32 83, i32 9, i32 85, i32 86, i32 87, i32 10, i32 89, i32 90, i32 91, i32 11, i32 93, i32 94, i32 95> 3015 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3016 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3017 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3018 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3019 ret void 3020} 3021 3022define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3023; SSE2-LABEL: vec384_v48i8_to_v8i48_factor6: 3024; SSE2: # %bb.0: 3025; SSE2-NEXT: movdqa (%rdi), %xmm0 3026; 
SSE2-NEXT: paddb (%rsi), %xmm0 3027; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3028; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3029; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3030; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 3031; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 3032; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 3033; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3034; SSE2-NEXT: paddb 16(%rdx), %xmm0 3035; SSE2-NEXT: paddb (%rdx), %xmm2 3036; SSE2-NEXT: paddb 32(%rdx), %xmm1 3037; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3038; SSE2-NEXT: movdqa %xmm2, (%rcx) 3039; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3040; SSE2-NEXT: retq 3041; 3042; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6: 3043; SSE42: # %bb.0: 3044; SSE42-NEXT: movdqa (%rdi), %xmm0 3045; SSE42-NEXT: paddb (%rsi), %xmm0 3046; SSE42-NEXT: movdqa %xmm0, %xmm1 3047; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero 3048; SSE42-NEXT: movdqa %xmm0, %xmm2 3049; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero 3050; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero 3051; SSE42-NEXT: paddb 16(%rdx), %xmm0 3052; SSE42-NEXT: paddb (%rdx), %xmm2 3053; SSE42-NEXT: paddb 32(%rdx), %xmm1 3054; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 3055; SSE42-NEXT: movdqa %xmm2, (%rcx) 3056; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3057; SSE42-NEXT: retq 3058; 3059; AVX-LABEL: vec384_v48i8_to_v8i48_factor6: 3060; AVX: # %bb.0: 3061; AVX-NEXT: vmovdqa (%rdi), %xmm0 3062; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3063; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero 3064; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero 3065; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero 3066; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3067; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3068; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3069; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3070; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3071; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3072; AVX-NEXT: retq 3073; 3074; AVX2-LABEL: vec384_v48i8_to_v8i48_factor6: 3075; AVX2: # %bb.0: 3076; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3077; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3078; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero 3079; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] 3080; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] 3081; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3082; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3083; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3084; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 3085; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3086; AVX2-NEXT: vzeroupper 3087; AVX2-NEXT: retq 3088; 3089; AVX512F-LABEL: vec384_v48i8_to_v8i48_factor6: 3090; AVX512F: # %bb.0: 3091; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3092; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3093; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero 3094; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] 3095; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] 
3096; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3097; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3098; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3099; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 3100; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3101; AVX512F-NEXT: vzeroupper 3102; AVX512F-NEXT: retq 3103; 3104; AVX512BW-LABEL: vec384_v48i8_to_v8i48_factor6: 3105; AVX512BW: # %bb.0: 3106; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 3107; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3108; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3109; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5] 3110; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 3111; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 3112; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3113; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero 3114; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3115; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3116; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3117; AVX512BW-NEXT: vzeroupper 3118; AVX512BW-NEXT: retq 3119 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3120 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3121 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3122 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3123 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 1, i32 55, i32 56, i32 57, i32 58, i32 59, i32 2, i32 61, i32 62, i32 63, i32 64, i32 65, i32 3, i32 67, i32 68, i32 69, i32 70, i32 71, i32 4, i32 73, i32 74, i32 75, i32 76, i32 77, i32 5, i32 79, i32 80, i32 81, i32 82, i32 83, i32 6, i32 85, i32 86, i32 87, i32 88, i32 89, i32 7, i32 91, i32 92, i32 93, i32 94, i32 95> 3124 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3125 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3126 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3127 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3128 ret void 3129} 3130 3131define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3132; SSE2-LABEL: vec384_v48i8_to_v6i64_factor8: 3133; SSE2: # %bb.0: 3134; SSE2-NEXT: movdqa (%rdi), %xmm0 3135; SSE2-NEXT: paddb (%rsi), %xmm0 3136; SSE2-NEXT: pxor %xmm1, %xmm1 3137; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3138; SSE2-NEXT: movdqa %xmm0, %xmm2 3139; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3140; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3141; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3142; SSE2-NEXT: movdqa %xmm0, %xmm3 3143; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 3144; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3145; SSE2-NEXT: paddb 16(%rdx), %xmm0 3146; SSE2-NEXT: paddb (%rdx), %xmm3 3147; SSE2-NEXT: paddb 32(%rdx), %xmm2 3148; SSE2-NEXT: movdqa %xmm2, 32(%rcx) 3149; SSE2-NEXT: movdqa %xmm3, (%rcx) 3150; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3151; SSE2-NEXT: retq 3152; 3153; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8: 3154; SSE42: # %bb.0: 3155; SSE42-NEXT: movdqa (%rdi), %xmm0 3156; SSE42-NEXT: paddb (%rsi), %xmm0 3157; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3158; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 3159; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 3160; SSE42-NEXT: psrld $16, %xmm0 3161; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3162; SSE42-NEXT: paddb 16(%rdx), %xmm0 3163; SSE42-NEXT: paddb 32(%rdx), %xmm2 3164; SSE42-NEXT: paddb (%rdx), %xmm1 3165; SSE42-NEXT: movdqa %xmm1, (%rcx) 3166; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 3167; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3168; SSE42-NEXT: retq 3169; 3170; AVX-LABEL: vec384_v48i8_to_v6i64_factor8: 3171; AVX: # %bb.0: 3172; AVX-NEXT: vmovdqa (%rdi), %xmm0 3173; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3174; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3175; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 3176; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 3177; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3178; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3179; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3180; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3181; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3182; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3183; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3184; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3185; AVX-NEXT: retq 3186; 3187; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8: 3188; AVX2-SLOW: # %bb.0: 3189; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 3190; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3191; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 3192; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 3193; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3194; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3195; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3196; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 3197; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 3198; AVX2-SLOW-NEXT: vzeroupper 3199; 
AVX2-SLOW-NEXT: retq 3200; 3201; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8: 3202; AVX2-FAST-PERLANE: # %bb.0: 3203; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 3204; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3205; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 3206; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero 3207; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3208; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3209; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 3210; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 3211; AVX2-FAST-PERLANE-NEXT: vzeroupper 3212; AVX2-FAST-PERLANE-NEXT: retq 3213; 3214; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8: 3215; AVX2-FAST: # %bb.0: 3216; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 3217; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3218; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 3219; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero 3220; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3221; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3222; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 3223; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 3224; AVX2-FAST-NEXT: vzeroupper 3225; AVX2-FAST-NEXT: retq 3226; 3227; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8: 3228; AVX512F: # %bb.0: 3229; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3230; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3231; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 3232; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero 3233; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3234; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3235; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3236; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3237; AVX512F-NEXT: vzeroupper 3238; AVX512F-NEXT: retq 3239; 3240; AVX512BW-LABEL: vec384_v48i8_to_v6i64_factor8: 3241; AVX512BW: # %bb.0: 3242; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 3243; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3244; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero 3245; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3246; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3247; AVX512BW-NEXT: vzeroupper 3248; AVX512BW-NEXT: retq 3249 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3250 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3251 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3252 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3253 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 1, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 2, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 3, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 4, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 5, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3254 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3255 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3256 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3257 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3258 ret void 3259} 3260 3261define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3262; SSE2-LABEL: vec384_v48i8_to_v4i96_factor12: 3263; SSE2: # %bb.0: 3264; SSE2-NEXT: movdqa (%rdi), %xmm0 3265; SSE2-NEXT: paddb (%rsi), %xmm0 3266; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 3267; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3268; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 3269; SSE2-NEXT: movdqa %xmm0, %xmm2 3270; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] 3271; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3272; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 3273; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3274; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] 3275; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 3276; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3277; SSE2-NEXT: paddb (%rdx), %xmm0 3278; SSE2-NEXT: paddb 16(%rdx), %xmm2 3279; SSE2-NEXT: paddb 32(%rdx), %xmm1 3280; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3281; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 3282; SSE2-NEXT: movdqa %xmm0, (%rcx) 3283; SSE2-NEXT: retq 3284; 3285; SSE42-LABEL: vec384_v48i8_to_v4i96_factor12: 3286; SSE42: # %bb.0: 3287; SSE42-NEXT: movdqa (%rdi), %xmm0 3288; SSE42-NEXT: paddb (%rsi), %xmm0 3289; SSE42-NEXT: movdqa %xmm0, %xmm1 3290; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3291; SSE42-NEXT: movdqa %xmm0, %xmm2 3292; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero 3293; SSE42-NEXT: pshufb 
{{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero 3294; SSE42-NEXT: paddb 16(%rdx), %xmm0 3295; SSE42-NEXT: paddb (%rdx), %xmm2 3296; SSE42-NEXT: paddb 32(%rdx), %xmm1 3297; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 3298; SSE42-NEXT: movdqa %xmm2, (%rcx) 3299; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3300; SSE42-NEXT: retq 3301; 3302; AVX-LABEL: vec384_v48i8_to_v4i96_factor12: 3303; AVX: # %bb.0: 3304; AVX-NEXT: vmovdqa (%rdi), %xmm0 3305; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3306; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero 3307; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero 3308; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3309; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3310; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3311; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3312; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3313; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3314; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3315; AVX-NEXT: retq 3316; 3317; AVX2-LABEL: vec384_v48i8_to_v4i96_factor12: 3318; AVX2: # %bb.0: 3319; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3320; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3321; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3322; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3323; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 3324; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3325; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3326; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3327; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 3328; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3329; AVX2-NEXT: vzeroupper 3330; AVX2-NEXT: retq 3331; 3332; AVX512F-LABEL: vec384_v48i8_to_v4i96_factor12: 3333; AVX512F: # %bb.0: 3334; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3335; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3336; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3337; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3338; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 3339; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3340; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3341; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3342; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 3343; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3344; AVX512F-NEXT: vzeroupper 3345; AVX512F-NEXT: retq 3346; 3347; AVX512BW-LABEL: vec384_v48i8_to_v4i96_factor12: 3348; AVX512BW: # %bb.0: 3349; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 3350; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3351; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3352; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 3353; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3354; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3355; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3356; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3357; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3358; AVX512BW-NEXT: vzeroupper 3359; AVX512BW-NEXT: retq 3360 %in.vec.base = load <64 x i8>, 
ptr %in.vec.base.ptr, align 64 3361 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3362 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3363 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3364 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 1, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 2, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 3, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3365 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3366 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3367 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3368 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3369 ret void 3370} 3371 3372define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3373; SSE2-LABEL: vec384_v48i8_to_v3i128_factor16: 3374; SSE2: # %bb.0: 3375; SSE2-NEXT: movdqa (%rdi), %xmm0 3376; SSE2-NEXT: paddb (%rsi), %xmm0 3377; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] 3378; SSE2-NEXT: pand %xmm0, %xmm1 3379; SSE2-NEXT: movdqa %xmm0, %xmm2 3380; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] 3381; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3382; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 3383; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3384; SSE2-NEXT: paddb 16(%rdx), %xmm0 3385; SSE2-NEXT: paddb 32(%rdx), %xmm2 3386; SSE2-NEXT: paddb (%rdx), %xmm1 3387; SSE2-NEXT: movdqa %xmm1, (%rcx) 3388; SSE2-NEXT: movdqa %xmm2, 32(%rcx) 3389; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3390; SSE2-NEXT: retq 3391; 3392; SSE42-LABEL: vec384_v48i8_to_v3i128_factor16: 3393; SSE42: # %bb.0: 3394; SSE42-NEXT: movdqa (%rdi), %xmm0 3395; SSE42-NEXT: paddb (%rsi), %xmm0 3396; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] 3397; SSE42-NEXT: pand %xmm0, %xmm1 3398; SSE42-NEXT: movdqa %xmm0, %xmm2 3399; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] 3400; SSE42-NEXT: psrldq {{.*#+}} xmm2 = 
xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3401; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 3402; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3403; SSE42-NEXT: paddb 16(%rdx), %xmm0 3404; SSE42-NEXT: paddb 32(%rdx), %xmm2 3405; SSE42-NEXT: paddb (%rdx), %xmm1 3406; SSE42-NEXT: movdqa %xmm1, (%rcx) 3407; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 3408; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3409; SSE42-NEXT: retq 3410; 3411; AVX-LABEL: vec384_v48i8_to_v3i128_factor16: 3412; AVX: # %bb.0: 3413; AVX-NEXT: vmovdqa (%rdi), %xmm0 3414; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3415; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 3416; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 3417; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3418; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] 3419; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3420; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3421; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3422; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3423; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3424; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3425; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3426; AVX-NEXT: retq 3427; 3428; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16: 3429; AVX2-SLOW: # %bb.0: 3430; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 3431; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3432; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] 3433; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3434; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3435; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 3436; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3437; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3438; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3439; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 3440; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 3441; AVX2-SLOW-NEXT: vzeroupper 3442; AVX2-SLOW-NEXT: retq 3443; 3444; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16: 3445; AVX2-FAST-PERLANE: # %bb.0: 3446; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 3447; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3448; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3449; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3450; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 3451; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3452; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3453; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3454; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) 3455; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 3456; AVX2-FAST-PERLANE-NEXT: vzeroupper 3457; AVX2-FAST-PERLANE-NEXT: retq 3458; 3459; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16: 3460; AVX2-FAST: # 
%bb.0: 3461; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 3462; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3463; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3464; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3465; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 3466; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3467; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3468; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3469; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) 3470; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) 3471; AVX2-FAST-NEXT: vzeroupper 3472; AVX2-FAST-NEXT: retq 3473; 3474; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16: 3475; AVX512F: # %bb.0: 3476; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3477; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3478; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3479; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3480; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 3481; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3482; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3483; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3484; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 3485; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3486; AVX512F-NEXT: vzeroupper 3487; AVX512F-NEXT: retq 3488; 3489; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16: 3490; AVX512BW: # %bb.0: 3491; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 3492; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3493; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3494; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 3495; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3496; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3497; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3498; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3499; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3500; AVX512BW-NEXT: vzeroupper 3501; AVX512BW-NEXT: retq 3502 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3503 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3504 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3505 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3506 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 1, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3507 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, 
i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3508 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3509 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3510 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3511 ret void 3512} 3513 3514define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3515; SSE2-LABEL: vec384_v48i8_to_v2i192_factor24: 3516; SSE2: # %bb.0: 3517; SSE2-NEXT: movdqa (%rdi), %xmm0 3518; SSE2-NEXT: paddb (%rsi), %xmm0 3519; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] 3520; SSE2-NEXT: pand %xmm0, %xmm1 3521; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 3522; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3523; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 3524; SSE2-NEXT: movaps 32(%rdx), %xmm2 3525; SSE2-NEXT: paddb 16(%rdx), %xmm0 3526; SSE2-NEXT: paddb (%rdx), %xmm1 3527; SSE2-NEXT: movaps %xmm2, 32(%rcx) 3528; SSE2-NEXT: movdqa %xmm1, (%rcx) 3529; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3530; SSE2-NEXT: retq 3531; 3532; SSE42-LABEL: vec384_v48i8_to_v2i192_factor24: 3533; SSE42: # %bb.0: 3534; SSE42-NEXT: movdqa (%rdi), %xmm0 3535; SSE42-NEXT: paddb (%rsi), %xmm0 3536; SSE42-NEXT: movdqa %xmm0, %xmm1 3537; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 3538; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3539; SSE42-NEXT: movaps 32(%rdx), %xmm2 3540; SSE42-NEXT: paddb (%rdx), %xmm0 3541; SSE42-NEXT: paddb 16(%rdx), %xmm1 3542; SSE42-NEXT: movaps %xmm2, 32(%rcx) 3543; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 3544; SSE42-NEXT: movdqa %xmm0, (%rcx) 3545; SSE42-NEXT: retq 3546; 3547; AVX-LABEL: vec384_v48i8_to_v2i192_factor24: 3548; AVX: # %bb.0: 3549; AVX-NEXT: vmovdqa (%rdi), %xmm0 3550; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3551; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 3552; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3553; AVX-NEXT: vmovaps 32(%rdx), %ymm2 3554; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3555; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3556; AVX-NEXT: vmovaps %ymm2, 32(%rcx) 3557; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3558; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3559; AVX-NEXT: vzeroupper 3560; AVX-NEXT: retq 3561; 3562; AVX2-LABEL: vec384_v48i8_to_v2i192_factor24: 3563; AVX2: # %bb.0: 3564; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3565; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3566; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3567; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 3568; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3569; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 3570; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3571; AVX2-NEXT: vmovaps 
%ymm1, 32(%rcx) 3572; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3573; AVX2-NEXT: vzeroupper 3574; AVX2-NEXT: retq 3575; 3576; AVX512F-LABEL: vec384_v48i8_to_v2i192_factor24: 3577; AVX512F: # %bb.0: 3578; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3579; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3580; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3581; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 3582; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3583; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3584; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 3585; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 3586; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3587; AVX512F-NEXT: vzeroupper 3588; AVX512F-NEXT: retq 3589; 3590; AVX512BW-LABEL: vec384_v48i8_to_v2i192_factor24: 3591; AVX512BW: # %bb.0: 3592; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 3593; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3594; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 3595; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 3596; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3597; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3598; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3599; AVX512BW-NEXT: vzeroupper 3600; AVX512BW-NEXT: retq 3601 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3602 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3603 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3604 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3605 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 1, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3606 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3607 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3608 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3609 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3610 ret void 3611} 3612 3613define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3614; SSE-LABEL: vec384_v48i8_to_v1i384_factor48: 3615; 
SSE: # %bb.0: 3616; SSE-NEXT: movdqa (%rdi), %xmm0 3617; SSE-NEXT: paddb (%rsi), %xmm0 3618; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3619; SSE-NEXT: movaps 16(%rdx), %xmm1 3620; SSE-NEXT: movaps 32(%rdx), %xmm2 3621; SSE-NEXT: paddb (%rdx), %xmm0 3622; SSE-NEXT: movaps %xmm1, 16(%rcx) 3623; SSE-NEXT: movaps %xmm2, 32(%rcx) 3624; SSE-NEXT: movdqa %xmm0, (%rcx) 3625; SSE-NEXT: retq 3626; 3627; AVX-LABEL: vec384_v48i8_to_v1i384_factor48: 3628; AVX: # %bb.0: 3629; AVX-NEXT: vmovdqa (%rdi), %xmm0 3630; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3631; AVX-NEXT: vmovaps 32(%rdx), %ymm1 3632; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3633; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 3634; AVX-NEXT: vmovaps 16(%rdx), %xmm2 3635; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 3636; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 3637; AVX-NEXT: vmovdqa %xmm0, (%rcx) 3638; AVX-NEXT: vzeroupper 3639; AVX-NEXT: retq 3640; 3641; AVX2-LABEL: vec384_v48i8_to_v1i384_factor48: 3642; AVX2: # %bb.0: 3643; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3644; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3645; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 3646; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3647; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 3648; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3649; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 3650; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3651; AVX2-NEXT: vzeroupper 3652; AVX2-NEXT: retq 3653; 3654; AVX512F-LABEL: vec384_v48i8_to_v1i384_factor48: 3655; AVX512F: # %bb.0: 3656; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 3657; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3658; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 3659; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 3660; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3661; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 3662; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 3663; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3664; AVX512F-NEXT: vzeroupper 3665; AVX512F-NEXT: retq 3666; 3667; AVX512BW-LABEL: vec384_v48i8_to_v1i384_factor48: 3668; AVX512BW: # %bb.0: 3669; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3670; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3671; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 3672; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3673; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3674; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3675; AVX512BW-NEXT: vzeroupper 3676; AVX512BW-NEXT: retq 3677 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3678 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3679 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3680 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3681 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3682 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, 
i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3683 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3684 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3685 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3686 ret void 3687} 3688 3689define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3690; SSE2-LABEL: vec384_v24i16_to_v12i32_factor2: 3691; SSE2: # %bb.0: 3692; SSE2-NEXT: movdqa (%rdi), %xmm0 3693; SSE2-NEXT: movdqa 16(%rdi), %xmm1 3694; SSE2-NEXT: paddb (%rsi), %xmm0 3695; SSE2-NEXT: paddb 16(%rsi), %xmm1 3696; SSE2-NEXT: pxor %xmm2, %xmm2 3697; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 3698; SSE2-NEXT: movdqa %xmm0, %xmm3 3699; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3700; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 3701; SSE2-NEXT: paddb 16(%rdx), %xmm0 3702; SSE2-NEXT: paddb (%rdx), %xmm3 3703; SSE2-NEXT: paddb 32(%rdx), %xmm1 3704; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3705; SSE2-NEXT: movdqa %xmm3, (%rcx) 3706; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3707; SSE2-NEXT: retq 3708; 3709; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2: 3710; SSE42: # %bb.0: 3711; SSE42-NEXT: movdqa (%rdi), %xmm0 3712; SSE42-NEXT: movdqa 16(%rdi), %xmm1 3713; SSE42-NEXT: paddb (%rsi), %xmm0 3714; SSE42-NEXT: paddb 16(%rsi), %xmm1 3715; SSE42-NEXT: pxor %xmm2, %xmm2 3716; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 3717; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3718; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 3719; SSE42-NEXT: paddb 16(%rdx), %xmm0 3720; SSE42-NEXT: paddb (%rdx), %xmm3 3721; SSE42-NEXT: paddb 32(%rdx), %xmm1 3722; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 3723; SSE42-NEXT: movdqa %xmm3, (%rcx) 3724; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3725; SSE42-NEXT: retq 3726; 3727; AVX-LABEL: vec384_v24i16_to_v12i32_factor2: 3728; AVX: # %bb.0: 3729; AVX-NEXT: vmovdqa (%rdi), %xmm0 3730; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 3731; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 3732; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3733; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3734; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 3735; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 3736; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 3737; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 3738; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3739; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 3740; AVX-NEXT: vmovdqa %xmm2, (%rcx) 3741; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3742; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 3743; AVX-NEXT: retq 3744; 3745; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2: 3746; AVX2: # %bb.0: 3747; 
AVX2-NEXT: vmovdqa (%rdi), %ymm0 3748; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3749; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3750; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 3751; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3752; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3753; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3754; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3755; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3756; AVX2-NEXT: vzeroupper 3757; AVX2-NEXT: retq 3758; 3759; AVX512F-LABEL: vec384_v24i16_to_v12i32_factor2: 3760; AVX512F: # %bb.0: 3761; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 3762; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3763; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3764; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 3765; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3766; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3767; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3768; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3769; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3770; AVX512F-NEXT: vzeroupper 3771; AVX512F-NEXT: retq 3772; 3773; AVX512BW-LABEL: vec384_v24i16_to_v12i32_factor2: 3774; AVX512BW: # %bb.0: 3775; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 3776; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3777; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3778; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 3779; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3780; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3781; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3782; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3783; AVX512BW-NEXT: vzeroupper 3784; AVX512BW-NEXT: retq 3785 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3786 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3787 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3788 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3789 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 3790 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 1, i32 27, i32 2, i32 29, i32 3, i32 31, i32 4, i32 33, i32 5, i32 35, i32 6, i32 37, i32 7, i32 39, i32 8, i32 41, i32 9, i32 43, i32 10, i32 45, i32 11, i32 47> 3791 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 3792 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3793 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3794 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3795 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3796 ret void 3797} 3798 3799define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3800; SSE2-LABEL: vec384_v24i16_to_v8i48_factor3: 3801; SSE2: # %bb.0: 3802; SSE2-NEXT: movdqa (%rdi), %xmm0 3803; SSE2-NEXT: paddb (%rsi), %xmm0 3804; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3805; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3806; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 3807; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 3808; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 3809; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3810; SSE2-NEXT: paddb 16(%rdx), %xmm0 3811; SSE2-NEXT: paddb (%rdx), %xmm2 3812; SSE2-NEXT: paddb 32(%rdx), %xmm1 3813; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3814; SSE2-NEXT: movdqa %xmm2, (%rcx) 3815; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3816; SSE2-NEXT: retq 3817; 3818; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3: 3819; SSE42: # %bb.0: 3820; SSE42-NEXT: movdqa (%rdi), %xmm0 3821; SSE42-NEXT: paddb (%rsi), %xmm0 3822; SSE42-NEXT: pxor %xmm1, %xmm1 3823; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 3824; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] 3825; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] 3826; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] 3827; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 3828; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] 3829; SSE42-NEXT: paddb 16(%rdx), %xmm0 3830; SSE42-NEXT: paddb (%rdx), %xmm3 3831; SSE42-NEXT: paddb 32(%rdx), %xmm2 3832; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 3833; SSE42-NEXT: movdqa %xmm3, (%rcx) 3834; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3835; SSE42-NEXT: retq 3836; 3837; AVX-LABEL: vec384_v24i16_to_v8i48_factor3: 3838; AVX: # %bb.0: 3839; AVX-NEXT: vmovdqa (%rdi), %xmm0 3840; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3841; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 3842; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 3843; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] 3844; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] 3845; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] 3846; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3847; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3848; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3849; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm2 3850; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3851; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3852; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3853; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3854; AVX-NEXT: retq 3855; 3856; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: 3857; AVX2-SLOW: # %bb.0: 3858; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 3859; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3860; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] 3861; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 3862; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3863; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3864; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3865; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3866; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3867; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3868; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 3869; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 3870; AVX2-SLOW-NEXT: vzeroupper 3871; AVX2-SLOW-NEXT: retq 3872; 3873; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v8i48_factor3: 3874; AVX2-FAST-PERLANE: # %bb.0: 3875; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 3876; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3877; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] 3878; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 3879; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3880; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero 3881; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3882; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3883; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 3884; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 3885; AVX2-FAST-PERLANE-NEXT: vzeroupper 3886; AVX2-FAST-PERLANE-NEXT: retq 3887; 3888; AVX2-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: 3889; AVX2-FAST: # %bb.0: 3890; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 3891; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3892; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] 3893; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 3894; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3895; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero 3896; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3897; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3898; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 3899; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 3900; AVX2-FAST-NEXT: vzeroupper 3901; AVX2-FAST-NEXT: retq 3902; 3903; AVX512F-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: 3904; AVX512F-SLOW: # %bb.0: 3905; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 3906; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3907; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] 3908; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 3909; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3910; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3911; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3912; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3913; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3914; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3915; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 3916; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 3917; AVX512F-SLOW-NEXT: vzeroupper 3918; AVX512F-SLOW-NEXT: retq 3919; 3920; AVX512F-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: 3921; AVX512F-FAST: # %bb.0: 3922; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 3923; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3924; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] 3925; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 3926; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3927; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero 3928; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3929; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3930; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) 3931; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 3932; AVX512F-FAST-NEXT: 
vzeroupper 3933; AVX512F-FAST-NEXT: retq 3934; 3935; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: 3936; AVX512BW-SLOW: # %bb.0: 3937; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 3938; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3939; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] 3940; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3941; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 3942; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3943; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 3944; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] 3945; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 3946; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3947; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 3948; AVX512BW-SLOW-NEXT: vzeroupper 3949; AVX512BW-SLOW-NEXT: retq 3950; 3951; AVX512BW-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: 3952; AVX512BW-FAST: # %bb.0: 3953; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 3954; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3955; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] 3956; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 3957; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 3958; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero 3959; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 3960; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3961; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 3962; AVX512BW-FAST-NEXT: vzeroupper 3963; AVX512BW-FAST-NEXT: retq 3964 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3965 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3966 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3967 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3968 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 3969 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 1, i32 28, i32 29, i32 2, i32 31, i32 32, i32 3, i32 34, i32 35, i32 4, i32 37, i32 38, i32 5, i32 40, i32 41, i32 6, i32 43, i32 44, i32 7, i32 46, i32 47> 3970 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 3971 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3972 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3973 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3974 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 
3975 ret void 3976} 3977 3978define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3979; SSE2-LABEL: vec384_v24i16_to_v6i64_factor4: 3980; SSE2: # %bb.0: 3981; SSE2-NEXT: movdqa (%rdi), %xmm0 3982; SSE2-NEXT: paddb (%rsi), %xmm0 3983; SSE2-NEXT: pxor %xmm1, %xmm1 3984; SSE2-NEXT: movdqa %xmm0, %xmm2 3985; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3986; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3987; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3988; SSE2-NEXT: movdqa %xmm0, %xmm3 3989; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 3990; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3991; SSE2-NEXT: paddb 16(%rdx), %xmm0 3992; SSE2-NEXT: paddb (%rdx), %xmm3 3993; SSE2-NEXT: paddb 32(%rdx), %xmm2 3994; SSE2-NEXT: movdqa %xmm2, 32(%rcx) 3995; SSE2-NEXT: movdqa %xmm3, (%rcx) 3996; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3997; SSE2-NEXT: retq 3998; 3999; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4: 4000; SSE42: # %bb.0: 4001; SSE42-NEXT: movdqa (%rdi), %xmm0 4002; SSE42-NEXT: paddb (%rsi), %xmm0 4003; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4004; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 4005; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4006; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 4007; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4008; SSE42-NEXT: paddb 16(%rdx), %xmm0 4009; SSE42-NEXT: paddb 32(%rdx), %xmm2 4010; SSE42-NEXT: paddb (%rdx), %xmm1 4011; SSE42-NEXT: movdqa %xmm1, (%rcx) 4012; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 4013; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4014; SSE42-NEXT: retq 4015; 4016; AVX-LABEL: vec384_v24i16_to_v6i64_factor4: 4017; AVX: # %bb.0: 4018; AVX-NEXT: vmovdqa (%rdi), %xmm0 4019; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4020; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4021; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 4022; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 4023; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4024; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4025; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4026; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4027; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4028; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4029; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 4030; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4031; AVX-NEXT: retq 4032; 4033; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4: 4034; AVX2-SLOW: # %bb.0: 4035; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4036; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4037; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4038; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4039; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4040; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4041; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4042; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4043; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4044; AVX2-SLOW-NEXT: vzeroupper 4045; AVX2-SLOW-NEXT: retq 4046; 4047; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4: 4048; AVX2-FAST-PERLANE: # %bb.0: 4049; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 4050; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4051; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4052; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero 4053; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4054; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4055; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 4056; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 4057; AVX2-FAST-PERLANE-NEXT: vzeroupper 4058; AVX2-FAST-PERLANE-NEXT: retq 4059; 4060; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4: 4061; AVX2-FAST: # %bb.0: 4062; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 4063; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4064; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4065; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero 4066; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4067; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4068; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4069; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4070; AVX2-FAST-NEXT: vzeroupper 4071; AVX2-FAST-NEXT: retq 4072; 4073; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4: 4074; AVX512F: # %bb.0: 4075; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 4076; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4077; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4078; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero 4079; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4080; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4081; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 4082; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 4083; AVX512F-NEXT: vzeroupper 4084; AVX512F-NEXT: retq 4085; 4086; AVX512BW-LABEL: vec384_v24i16_to_v6i64_factor4: 4087; AVX512BW: # %bb.0: 4088; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 4089; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4090; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4091; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero 4092; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4093; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4094; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4095; AVX512BW-NEXT: vzeroupper 4096; AVX512BW-NEXT: retq 4097 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4098 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4099 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4100 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4101 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 4102 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, 
i32 27, i32 1, i32 29, i32 30, i32 31, i32 2, i32 33, i32 34, i32 35, i32 3, i32 37, i32 38, i32 39, i32 4, i32 41, i32 42, i32 43, i32 5, i32 45, i32 46, i32 47> 4103 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 4104 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4105 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4106 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4107 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4108 ret void 4109} 4110 4111define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4112; SSE2-LABEL: vec384_v24i16_to_v4i96_factor6: 4113; SSE2: # %bb.0: 4114; SSE2-NEXT: movdqa (%rdi), %xmm0 4115; SSE2-NEXT: paddb (%rsi), %xmm0 4116; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] 4117; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4118; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 4119; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] 4120; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 4121; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 4122; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 4123; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4124; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 4125; SSE2-NEXT: paddb 16(%rdx), %xmm0 4126; SSE2-NEXT: paddb (%rdx), %xmm2 4127; SSE2-NEXT: paddb 32(%rdx), %xmm1 4128; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 4129; SSE2-NEXT: movdqa %xmm2, (%rcx) 4130; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4131; SSE2-NEXT: retq 4132; 4133; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6: 4134; SSE42: # %bb.0: 4135; SSE42-NEXT: movdqa (%rdi), %xmm0 4136; SSE42-NEXT: paddb (%rsi), %xmm0 4137; SSE42-NEXT: pxor %xmm1, %xmm1 4138; SSE42-NEXT: movdqa %xmm0, %xmm2 4139; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,3],zero,zero 4140; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero 4141; SSE42-NEXT: psrld $16, %xmm0 4142; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] 4143; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7] 4144; SSE42-NEXT: paddb 16(%rdx), %xmm3 4145; SSE42-NEXT: paddb 32(%rdx), %xmm0 4146; SSE42-NEXT: paddb (%rdx), %xmm2 4147; SSE42-NEXT: movdqa %xmm2, (%rcx) 4148; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 4149; SSE42-NEXT: movdqa %xmm3, 16(%rcx) 4150; SSE42-NEXT: retq 4151; 4152; AVX-LABEL: vec384_v24i16_to_v4i96_factor6: 4153; AVX: # %bb.0: 4154; AVX-NEXT: vmovdqa (%rdi), %xmm0 4155; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4156; AVX-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero 4157; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 4158; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 4159; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7] 4160; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 4161; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] 4162; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4163; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4164; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4165; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4166; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 4167; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4168; AVX-NEXT: retq 4169; 4170; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: 4171; AVX2-SLOW: # %bb.0: 4172; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4173; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4174; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 4175; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4176; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] 4177; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4178; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 4179; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4180; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4181; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4182; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 4183; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 4184; AVX2-SLOW-NEXT: vzeroupper 4185; AVX2-SLOW-NEXT: retq 4186; 4187; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v4i96_factor6: 4188; AVX2-FAST-PERLANE: # %bb.0: 4189; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 4190; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4191; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4192; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4193; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 4194; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4195; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4196; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4197; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) 4198; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 4199; AVX2-FAST-PERLANE-NEXT: vzeroupper 4200; AVX2-FAST-PERLANE-NEXT: retq 4201; 4202; AVX2-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: 4203; AVX2-FAST: # %bb.0: 4204; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 4205; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4206; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4207; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4208; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 4209; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4210; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4211; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4212; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) 4213; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) 4214; AVX2-FAST-NEXT: vzeroupper 4215; AVX2-FAST-NEXT: retq 4216; 4217; AVX512F-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: 4218; AVX512F-SLOW: # %bb.0: 4219; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4220; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4221; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 4222; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4223; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] 4224; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4225; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 4226; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4227; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4228; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4229; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 4230; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 4231; AVX512F-SLOW-NEXT: vzeroupper 4232; AVX512F-SLOW-NEXT: retq 4233; 4234; AVX512F-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: 4235; AVX512F-FAST: # %bb.0: 4236; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 4237; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4238; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4239; AVX512F-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4240; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 4241; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4242; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4243; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4244; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) 4245; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 4246; AVX512F-FAST-NEXT: vzeroupper 4247; AVX512F-FAST-NEXT: retq 4248; 4249; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: 4250; AVX512BW-SLOW: # %bb.0: 4251; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 4252; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4253; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] 4254; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4255; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 4256; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 4257; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 4258; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] 4259; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 4260; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4261; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4262; AVX512BW-SLOW-NEXT: vzeroupper 4263; AVX512BW-SLOW-NEXT: retq 4264; 4265; AVX512BW-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: 4266; AVX512BW-FAST: # %bb.0: 4267; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 4268; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4269; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] 4270; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4271; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 4272; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4273; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 4274; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4275; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4276; AVX512BW-FAST-NEXT: vzeroupper 4277; AVX512BW-FAST-NEXT: retq 4278 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4279 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4280 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4281 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, 
i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4282 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 4283 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 1, i32 31, i32 32, i32 33, i32 34, i32 35, i32 2, i32 37, i32 38, i32 39, i32 40, i32 41, i32 3, i32 43, i32 44, i32 45, i32 46, i32 47> 4284 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 4285 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4286 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4287 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4288 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4289 ret void 4290} 4291 4292define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4293; SSE2-LABEL: vec384_v24i16_to_v3i128_factor8: 4294; SSE2: # %bb.0: 4295; SSE2-NEXT: movdqa (%rdi), %xmm0 4296; SSE2-NEXT: paddb (%rsi), %xmm0 4297; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] 4298; SSE2-NEXT: pand %xmm0, %xmm1 4299; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 4300; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 4301; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4302; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4303; SSE2-NEXT: paddb 16(%rdx), %xmm2 4304; SSE2-NEXT: paddb 32(%rdx), %xmm0 4305; SSE2-NEXT: paddb (%rdx), %xmm1 4306; SSE2-NEXT: movdqa %xmm1, (%rcx) 4307; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 4308; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 4309; SSE2-NEXT: retq 4310; 4311; SSE42-LABEL: vec384_v24i16_to_v3i128_factor8: 4312; SSE42: # %bb.0: 4313; SSE42-NEXT: movdqa (%rdi), %xmm0 4314; SSE42-NEXT: paddb (%rsi), %xmm0 4315; SSE42-NEXT: pxor %xmm1, %xmm1 4316; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4317; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 4318; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 4319; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4320; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4321; SSE42-NEXT: paddb 16(%rdx), %xmm2 4322; SSE42-NEXT: paddb 32(%rdx), %xmm0 4323; SSE42-NEXT: paddb (%rdx), %xmm1 4324; SSE42-NEXT: movdqa %xmm1, (%rcx) 4325; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 4326; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 4327; SSE42-NEXT: retq 4328; 4329; AVX-LABEL: vec384_v24i16_to_v3i128_factor8: 4330; AVX: # %bb.0: 4331; AVX-NEXT: vmovdqa (%rdi), %xmm0 4332; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4333; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 4334; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1,2,3,4,5,6,7] 4335; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 4336; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4337; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 4338; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4339; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4340; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4341; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4342; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4343; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 4344; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4345; AVX-NEXT: retq 4346; 4347; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: 4348; AVX2-SLOW: # %bb.0: 4349; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 4350; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 4351; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 4352; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] 4353; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4354; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4355; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 4356; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] 4357; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4358; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 4359; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 4360; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 4361; AVX2-SLOW-NEXT: vzeroupper 4362; AVX2-SLOW-NEXT: retq 4363; 4364; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: 4365; AVX2-FAST-PERLANE: # %bb.0: 4366; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 4367; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 4368; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 4369; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4370; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4371; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 4372; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] 4373; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4374; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 4375; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) 4376; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 4377; AVX2-FAST-PERLANE-NEXT: vzeroupper 4378; AVX2-FAST-PERLANE-NEXT: retq 4379; 4380; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: 4381; AVX2-FAST: # %bb.0: 4382; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 4383; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 4384; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 4385; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4386; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 4387; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 4388; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] 4389; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4390; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 4391; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) 4392; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) 4393; AVX2-FAST-NEXT: vzeroupper 4394; AVX2-FAST-NEXT: retq 
4395; 4396; AVX512F-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: 4397; AVX512F-SLOW: # %bb.0: 4398; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4399; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4400; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 4401; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4402; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 4403; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4404; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 4405; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4406; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 4407; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4408; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4409; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 4410; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 4411; AVX512F-SLOW-NEXT: vzeroupper 4412; AVX512F-SLOW-NEXT: retq 4413; 4414; AVX512F-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: 4415; AVX512F-FAST: # %bb.0: 4416; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 4417; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4418; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4419; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4420; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 4421; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4422; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 4423; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4424; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4425; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) 4426; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 4427; AVX512F-FAST-NEXT: vzeroupper 4428; AVX512F-FAST-NEXT: retq 4429; 4430; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: 4431; AVX512BW-SLOW: # %bb.0: 4432; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 4433; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4434; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] 4435; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4436; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 4437; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 4438; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 4439; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4440; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 4441; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4442; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4443; AVX512BW-SLOW-NEXT: vzeroupper 4444; AVX512BW-SLOW-NEXT: retq 4445; 4446; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: 4447; AVX512BW-FAST: # %bb.0: 4448; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 4449; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4450; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] 4451; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4452; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 4453; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4454; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 4455; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4456; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4457; AVX512BW-FAST-NEXT: vzeroupper 4458; AVX512BW-FAST-NEXT: retq 4459 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 
4460 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4461 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4462 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4463 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 4464 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 1, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4465 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 4466 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4467 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4468 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4469 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4470 ret void 4471} 4472 4473define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4474; SSE2-LABEL: vec384_v24i16_to_v2i192_factor12: 4475; SSE2: # %bb.0: 4476; SSE2-NEXT: movdqa (%rdi), %xmm0 4477; SSE2-NEXT: paddb (%rsi), %xmm0 4478; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] 4479; SSE2-NEXT: pand %xmm0, %xmm1 4480; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 4481; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4482; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 4483; SSE2-NEXT: movaps 32(%rdx), %xmm2 4484; SSE2-NEXT: paddb 16(%rdx), %xmm0 4485; SSE2-NEXT: paddb (%rdx), %xmm1 4486; SSE2-NEXT: movaps %xmm2, 32(%rcx) 4487; SSE2-NEXT: movdqa %xmm1, (%rcx) 4488; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4489; SSE2-NEXT: retq 4490; 4491; SSE42-LABEL: vec384_v24i16_to_v2i192_factor12: 4492; SSE42: # %bb.0: 4493; SSE42-NEXT: movdqa (%rdi), %xmm0 4494; SSE42-NEXT: paddb (%rsi), %xmm0 4495; SSE42-NEXT: pxor %xmm1, %xmm1 4496; SSE42-NEXT: pxor %xmm2, %xmm2 4497; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4498; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4499; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] 4500; SSE42-NEXT: movaps 32(%rdx), %xmm1 4501; SSE42-NEXT: paddb 16(%rdx), %xmm0 4502; SSE42-NEXT: paddb (%rdx), %xmm2 4503; SSE42-NEXT: movaps %xmm1, 32(%rcx) 4504; SSE42-NEXT: movdqa %xmm2, (%rcx) 4505; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4506; SSE42-NEXT: retq 4507; 4508; AVX-LABEL: 
vec384_v24i16_to_v2i192_factor12: 4509; AVX: # %bb.0: 4510; AVX-NEXT: vmovdqa (%rdi), %xmm0 4511; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4512; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 4513; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4514; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4515; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] 4516; AVX-NEXT: vmovaps 32(%rdx), %ymm1 4517; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 4518; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 4519; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 4520; AVX-NEXT: vmovdqa %xmm2, (%rcx) 4521; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 4522; AVX-NEXT: vzeroupper 4523; AVX-NEXT: retq 4524; 4525; AVX2-LABEL: vec384_v24i16_to_v2i192_factor12: 4526; AVX2: # %bb.0: 4527; AVX2-NEXT: vmovdqa (%rdi), %xmm0 4528; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4529; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4530; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 4531; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4532; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 4533; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4534; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 4535; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 4536; AVX2-NEXT: vzeroupper 4537; AVX2-NEXT: retq 4538; 4539; AVX512F-LABEL: vec384_v24i16_to_v2i192_factor12: 4540; AVX512F: # %bb.0: 4541; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 4542; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4543; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 4544; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 4545; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4546; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4547; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 4548; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 4549; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 4550; AVX512F-NEXT: vzeroupper 4551; AVX512F-NEXT: retq 4552; 4553; AVX512BW-LABEL: vec384_v24i16_to_v2i192_factor12: 4554; AVX512BW: # %bb.0: 4555; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 4556; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4557; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] 4558; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4559; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 4560; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 4561; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4562; AVX512BW-NEXT: vzeroupper 4563; AVX512BW-NEXT: retq 4564 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4565 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4566 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4567 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4568 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 4569 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4570 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 4571 %out.bytevec.padded = shufflevector <48 x 
i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4572 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4573 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4574 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4575 ret void 4576} 4577 4578define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4579; SSE2-LABEL: vec384_v24i16_to_v1i384_factor24: 4580; SSE2: # %bb.0: 4581; SSE2-NEXT: movdqa (%rdi), %xmm0 4582; SSE2-NEXT: paddb (%rsi), %xmm0 4583; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 4584; SSE2-NEXT: movaps 16(%rdx), %xmm1 4585; SSE2-NEXT: movaps 32(%rdx), %xmm2 4586; SSE2-NEXT: paddb (%rdx), %xmm0 4587; SSE2-NEXT: movaps %xmm1, 16(%rcx) 4588; SSE2-NEXT: movaps %xmm2, 32(%rcx) 4589; SSE2-NEXT: movdqa %xmm0, (%rcx) 4590; SSE2-NEXT: retq 4591; 4592; SSE42-LABEL: vec384_v24i16_to_v1i384_factor24: 4593; SSE42: # %bb.0: 4594; SSE42-NEXT: movdqa (%rdi), %xmm0 4595; SSE42-NEXT: paddb (%rsi), %xmm0 4596; SSE42-NEXT: pxor %xmm1, %xmm1 4597; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4598; SSE42-NEXT: movaps 16(%rdx), %xmm0 4599; SSE42-NEXT: movaps 32(%rdx), %xmm2 4600; SSE42-NEXT: paddb (%rdx), %xmm1 4601; SSE42-NEXT: movaps %xmm0, 16(%rcx) 4602; SSE42-NEXT: movaps %xmm2, 32(%rcx) 4603; SSE42-NEXT: movdqa %xmm1, (%rcx) 4604; SSE42-NEXT: retq 4605; 4606; AVX-LABEL: vec384_v24i16_to_v1i384_factor24: 4607; AVX: # %bb.0: 4608; AVX-NEXT: vmovdqa (%rdi), %xmm0 4609; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4610; AVX-NEXT: vmovaps 32(%rdx), %ymm1 4611; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 4612; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4613; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 4614; AVX-NEXT: vmovaps 16(%rdx), %xmm2 4615; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 4616; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 4617; AVX-NEXT: vmovdqa %xmm0, (%rcx) 4618; AVX-NEXT: vzeroupper 4619; AVX-NEXT: retq 4620; 4621; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24: 4622; AVX2: # %bb.0: 4623; AVX2-NEXT: vmovdqa (%rdi), %ymm0 4624; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4625; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 4626; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 4627; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 4628; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4629; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 4630; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 4631; AVX2-NEXT: vzeroupper 4632; AVX2-NEXT: retq 4633; 4634; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24: 4635; AVX512F: # %bb.0: 4636; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4637; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4638; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 4639; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 4640; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4641; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 4642; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 4643; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 4644; AVX512F-NEXT: vzeroupper 4645; AVX512F-NEXT: retq 4646; 4647; AVX512BW-LABEL: 
vec384_v24i16_to_v1i384_factor24: 4648; AVX512BW: # %bb.0: 4649; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 4650; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4651; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 4652; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4653; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4654; AVX512BW-NEXT: vzeroupper 4655; AVX512BW-NEXT: retq 4656 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4657 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4658 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4659 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4660 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> 4661 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4662 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> 4663 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4664 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4665 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4666 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4667 ret void 4668} 4669 4670define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4671; SSE2-LABEL: vec384_v12i32_to_v6i64_factor2: 4672; SSE2: # %bb.0: 4673; SSE2-NEXT: movdqa (%rdi), %xmm0 4674; SSE2-NEXT: movdqa 16(%rdi), %xmm1 4675; SSE2-NEXT: paddb (%rsi), %xmm0 4676; SSE2-NEXT: paddb 16(%rsi), %xmm1 4677; SSE2-NEXT: pxor %xmm2, %xmm2 4678; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 4679; SSE2-NEXT: movdqa %xmm0, %xmm3 4680; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4681; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4682; SSE2-NEXT: paddb 16(%rdx), %xmm0 4683; SSE2-NEXT: paddb (%rdx), %xmm3 4684; SSE2-NEXT: paddb 32(%rdx), %xmm1 4685; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 4686; SSE2-NEXT: movdqa %xmm3, (%rcx) 4687; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4688; SSE2-NEXT: retq 4689; 4690; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2: 4691; SSE42: # %bb.0: 4692; SSE42-NEXT: movdqa (%rdi), %xmm0 4693; SSE42-NEXT: movdqa 16(%rdi), %xmm1 4694; SSE42-NEXT: paddb (%rsi), %xmm0 4695; SSE42-NEXT: paddb 16(%rsi), %xmm1 4696; SSE42-NEXT: pxor %xmm2, %xmm2 4697; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero 4698; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero 4699; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4700; SSE42-NEXT: paddb 16(%rdx), %xmm0 4701; SSE42-NEXT: paddb (%rdx), %xmm3 4702; SSE42-NEXT: paddb 32(%rdx), %xmm1 4703; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 4704; SSE42-NEXT: movdqa %xmm3, (%rcx) 4705; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4706; SSE42-NEXT: retq 4707; 4708; AVX-LABEL: vec384_v12i32_to_v6i64_factor2: 4709; AVX: # %bb.0: 4710; AVX-NEXT: vmovdqa (%rdi), %xmm0 4711; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 4712; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 4713; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4714; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 4715; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 4716; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] 4717; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 4718; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 4719; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 4720; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 4721; AVX-NEXT: vmovdqa %xmm2, (%rcx) 4722; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 4723; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 4724; AVX-NEXT: retq 4725; 4726; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2: 4727; AVX2: # %bb.0: 4728; AVX2-NEXT: vmovdqa (%rdi), %ymm0 4729; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4730; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4731; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 4732; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4733; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4734; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4735; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 4736; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 4737; AVX2-NEXT: vzeroupper 4738; AVX2-NEXT: retq 4739; 4740; AVX512F-LABEL: vec384_v12i32_to_v6i64_factor2: 4741; AVX512F: # %bb.0: 4742; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4743; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4744; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 4745; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 4746; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4747; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4748; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 4749; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 4750; AVX512F-NEXT: vzeroupper 4751; AVX512F-NEXT: retq 4752; 4753; AVX512BW-LABEL: vec384_v12i32_to_v6i64_factor2: 4754; AVX512BW: # %bb.0: 4755; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 4756; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4757; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 4758; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4759; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4760; AVX512BW-NEXT: vzeroupper 4761; AVX512BW-NEXT: retq 4762 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4763 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4764 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4765 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 
46, i32 47> 4766 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> 4767 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 1, i32 15, i32 2, i32 17, i32 3, i32 19, i32 4, i32 21, i32 5, i32 23> 4768 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> 4769 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4770 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4771 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4772 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4773 ret void 4774} 4775 4776define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4777; SSE2-LABEL: vec384_v12i32_to_v4i96_factor3: 4778; SSE2: # %bb.0: 4779; SSE2-NEXT: movdqa (%rdi), %xmm0 4780; SSE2-NEXT: paddb (%rsi), %xmm0 4781; SSE2-NEXT: xorps %xmm1, %xmm1 4782; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] 4783; SSE2-NEXT: pand %xmm0, %xmm2 4784; SSE2-NEXT: movdqa %xmm0, %xmm3 4785; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4786; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] 4787; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 4788; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] 4789; SSE2-NEXT: paddb (%rdx), %xmm0 4790; SSE2-NEXT: paddb 32(%rdx), %xmm3 4791; SSE2-NEXT: paddb 16(%rdx), %xmm2 4792; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 4793; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 4794; SSE2-NEXT: movdqa %xmm0, (%rcx) 4795; SSE2-NEXT: retq 4796; 4797; SSE42-LABEL: vec384_v12i32_to_v4i96_factor3: 4798; SSE42: # %bb.0: 4799; SSE42-NEXT: movdqa (%rdi), %xmm0 4800; SSE42-NEXT: paddb (%rsi), %xmm0 4801; SSE42-NEXT: pxor %xmm1, %xmm1 4802; SSE42-NEXT: pxor %xmm2, %xmm2 4803; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] 4804; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 4805; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] 4806; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4807; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 4808; SSE42-NEXT: paddb (%rdx), %xmm0 4809; SSE42-NEXT: paddb 32(%rdx), %xmm3 4810; SSE42-NEXT: paddb 16(%rdx), %xmm2 4811; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 4812; SSE42-NEXT: movdqa %xmm3, 32(%rcx) 4813; SSE42-NEXT: movdqa %xmm0, (%rcx) 4814; SSE42-NEXT: retq 4815; 4816; AVX-LABEL: vec384_v12i32_to_v4i96_factor3: 4817; AVX: # %bb.0: 4818; AVX-NEXT: vmovdqa (%rdi), %xmm0 4819; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4820; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 4821; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 4822; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 4823; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 4824; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4825; AVX-NEXT: vxorps %xmm2, %xmm2, 
%xmm2 4826; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] 4827; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 4828; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4829; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4830; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4831; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4832; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4833; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 4834; AVX-NEXT: vzeroupper 4835; AVX-NEXT: retq 4836; 4837; AVX2-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3: 4838; AVX2-SLOW: # %bb.0: 4839; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 4840; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4841; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 4842; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] 4843; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 4844; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4845; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4846; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] 4847; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4848; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4849; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4850; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4851; AVX2-SLOW-NEXT: vzeroupper 4852; AVX2-SLOW-NEXT: retq 4853; 4854; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v4i96_factor3: 4855; AVX2-FAST-PERLANE: # %bb.0: 4856; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 4857; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4858; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 4859; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] 4860; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 4861; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 4862; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4863; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4864; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 4865; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 4866; AVX2-FAST-PERLANE-NEXT: vzeroupper 4867; AVX2-FAST-PERLANE-NEXT: retq 4868; 4869; AVX2-FAST-LABEL: vec384_v12i32_to_v4i96_factor3: 4870; AVX2-FAST: # %bb.0: 4871; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 4872; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4873; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 4874; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] 4875; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] 4876; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 4877; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4878; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4879; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4880; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4881; AVX2-FAST-NEXT: vzeroupper 4882; AVX2-FAST-NEXT: retq 4883; 4884; AVX512F-LABEL: vec384_v12i32_to_v4i96_factor3: 4885; AVX512F: # %bb.0: 4886; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4887; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4888; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,0,0,0,0] 4889; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 4890; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 4891; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 4892; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4893; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 4894; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 4895; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 4896; AVX512F-NEXT: vzeroupper 4897; AVX512F-NEXT: retq 4898; 4899; 
AVX512BW-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3: 4900; AVX512BW-SLOW: # %bb.0: 4901; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 4902; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4903; AVX512BW-SLOW-NEXT: movb $73, %al 4904; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 4905; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} 4906; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4907; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4908; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] 4909; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4910; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4911; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4912; AVX512BW-SLOW-NEXT: vzeroupper 4913; AVX512BW-SLOW-NEXT: retq 4914; 4915; AVX512BW-FAST-LABEL: vec384_v12i32_to_v4i96_factor3: 4916; AVX512BW-FAST: # %bb.0: 4917; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 4918; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4919; AVX512BW-FAST-NEXT: movb $73, %al 4920; AVX512BW-FAST-NEXT: kmovd %eax, %k1 4921; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} 4922; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 4923; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4924; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4925; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4926; AVX512BW-FAST-NEXT: vzeroupper 4927; AVX512BW-FAST-NEXT: retq 4928 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4929 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4930 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4931 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4932 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> 4933 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 1, i32 16, i32 17, i32 2, i32 19, i32 20, i32 3, i32 22, i32 23> 4934 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> 4935 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4936 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4937 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4938 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4939 ret void 4940} 4941 4942define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4943; SSE2-LABEL: vec384_v12i32_to_v3i128_factor4: 4944; SSE2: # %bb.0: 4945; SSE2-NEXT: movdqa (%rdi), %xmm0 4946; SSE2-NEXT: 
paddb (%rsi), %xmm0 4947; SSE2-NEXT: xorps %xmm1, %xmm1 4948; SSE2-NEXT: xorps %xmm2, %xmm2 4949; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 4950; SSE2-NEXT: movdqa %xmm0, %xmm3 4951; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 4952; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] 4953; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] 4954; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 4955; SSE2-NEXT: paddb 16(%rdx), %xmm0 4956; SSE2-NEXT: paddb 32(%rdx), %xmm3 4957; SSE2-NEXT: paddb (%rdx), %xmm2 4958; SSE2-NEXT: movdqa %xmm2, (%rcx) 4959; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 4960; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4961; SSE2-NEXT: retq 4962; 4963; SSE42-LABEL: vec384_v12i32_to_v3i128_factor4: 4964; SSE42: # %bb.0: 4965; SSE42-NEXT: movdqa (%rdi), %xmm0 4966; SSE42-NEXT: paddb (%rsi), %xmm0 4967; SSE42-NEXT: pxor %xmm1, %xmm1 4968; SSE42-NEXT: pxor %xmm2, %xmm2 4969; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] 4970; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 4971; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5,6,7] 4972; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 4973; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 4974; SSE42-NEXT: paddb 16(%rdx), %xmm0 4975; SSE42-NEXT: paddb 32(%rdx), %xmm3 4976; SSE42-NEXT: paddb (%rdx), %xmm2 4977; SSE42-NEXT: movdqa %xmm2, (%rcx) 4978; SSE42-NEXT: movdqa %xmm3, 32(%rcx) 4979; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4980; SSE42-NEXT: retq 4981; 4982; AVX-LABEL: vec384_v12i32_to_v3i128_factor4: 4983; AVX: # %bb.0: 4984; AVX-NEXT: vmovdqa (%rdi), %xmm0 4985; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4986; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 4987; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4988; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 4989; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] 4990; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4991; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 4992; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] 4993; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 4994; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4995; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4996; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4997; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4998; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4999; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 5000; AVX-NEXT: vzeroupper 5001; AVX-NEXT: retq 5002; 5003; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: 5004; AVX2-SLOW: # %bb.0: 5005; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 5006; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 5007; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 5008; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 5009; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 5010; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] 5011; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 5012; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 5013; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] 5014; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5015; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 5016; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 5017; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 5018; AVX2-SLOW-NEXT: vzeroupper 5019; AVX2-SLOW-NEXT: retq 5020; 5021; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: 5022; AVX2-FAST-PERLANE: # %bb.0: 5023; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 5024; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 5025; 
AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 5026; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5027; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 5028; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 5029; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] 5030; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5031; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 5032; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) 5033; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 5034; AVX2-FAST-PERLANE-NEXT: vzeroupper 5035; AVX2-FAST-PERLANE-NEXT: retq 5036; 5037; AVX2-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: 5038; AVX2-FAST: # %bb.0: 5039; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 5040; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5041; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 5042; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] 5043; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 5044; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] 5045; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5046; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5047; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 5048; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 5049; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 5050; AVX2-FAST-NEXT: vzeroupper 5051; AVX2-FAST-NEXT: retq 5052; 5053; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4: 5054; AVX512F: # %bb.0: 5055; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5056; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5057; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,0,0,0,0] 5058; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 5059; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 5060; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 5061; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5062; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 5063; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 5064; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 5065; AVX512F-NEXT: vzeroupper 5066; AVX512F-NEXT: retq 5067; 5068; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: 5069; AVX512BW-SLOW: # %bb.0: 5070; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 5071; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5072; AVX512BW-SLOW-NEXT: movb $17, %al 5073; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 5074; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} 5075; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 5076; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 5077; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5078; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 5079; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5080; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 5081; AVX512BW-SLOW-NEXT: vzeroupper 5082; AVX512BW-SLOW-NEXT: retq 5083; 5084; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: 5085; AVX512BW-FAST: # %bb.0: 5086; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 5087; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5088; AVX512BW-FAST-NEXT: movb $17, %al 5089; AVX512BW-FAST-NEXT: kmovd %eax, %k1 5090; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} 5091; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5092; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 5093; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5094; AVX512BW-FAST-NEXT: 
vmovdqa64 %zmm0, (%rcx) 5095; AVX512BW-FAST-NEXT: vzeroupper 5096; AVX512BW-FAST-NEXT: retq 5097 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5098 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5099 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5100 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5101 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> 5102 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 1, i32 17, i32 18, i32 19, i32 2, i32 21, i32 22, i32 23> 5103 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> 5104 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5105 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5106 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5107 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5108 ret void 5109} 5110 5111define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5112; SSE2-LABEL: vec384_v12i32_to_v2i192_factor6: 5113; SSE2: # %bb.0: 5114; SSE2-NEXT: movdqa (%rdi), %xmm0 5115; SSE2-NEXT: paddb (%rsi), %xmm0 5116; SSE2-NEXT: xorps %xmm1, %xmm1 5117; SSE2-NEXT: xorps %xmm2, %xmm2 5118; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 5119; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0] 5120; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] 5121; SSE2-NEXT: movaps 32(%rdx), %xmm0 5122; SSE2-NEXT: paddb 16(%rdx), %xmm1 5123; SSE2-NEXT: paddb (%rdx), %xmm2 5124; SSE2-NEXT: movaps %xmm0, 32(%rcx) 5125; SSE2-NEXT: movdqa %xmm2, (%rcx) 5126; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 5127; SSE2-NEXT: retq 5128; 5129; SSE42-LABEL: vec384_v12i32_to_v2i192_factor6: 5130; SSE42: # %bb.0: 5131; SSE42-NEXT: movdqa (%rdi), %xmm0 5132; SSE42-NEXT: paddb (%rsi), %xmm0 5133; SSE42-NEXT: pxor %xmm1, %xmm1 5134; SSE42-NEXT: pxor %xmm2, %xmm2 5135; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5136; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 5137; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] 5138; SSE42-NEXT: movaps 32(%rdx), %xmm1 5139; SSE42-NEXT: paddb 16(%rdx), %xmm0 5140; SSE42-NEXT: paddb (%rdx), %xmm2 5141; SSE42-NEXT: movaps %xmm1, 32(%rcx) 5142; SSE42-NEXT: movdqa %xmm2, (%rcx) 5143; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 5144; SSE42-NEXT: retq 5145; 5146; AVX-LABEL: vec384_v12i32_to_v2i192_factor6: 5147; AVX: # %bb.0: 5148; AVX-NEXT: vmovdqa (%rdi), %xmm0 5149; 
AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5150; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 5151; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 5152; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 5153; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] 5154; AVX-NEXT: vmovaps 32(%rdx), %ymm1 5155; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 5156; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 5157; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5158; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5159; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5160; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 5161; AVX-NEXT: vzeroupper 5162; AVX-NEXT: retq 5163; 5164; AVX2-SLOW-LABEL: vec384_v12i32_to_v2i192_factor6: 5165; AVX2-SLOW: # %bb.0: 5166; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 5167; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 5168; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 5169; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 5170; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 5171; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] 5172; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 5173; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5174; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) 5175; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 5176; AVX2-SLOW-NEXT: vzeroupper 5177; AVX2-SLOW-NEXT: retq 5178; 5179; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v2i192_factor6: 5180; AVX2-FAST-PERLANE: # %bb.0: 5181; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 5182; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 5183; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 5184; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 5185; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] 5186; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] 5187; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 5188; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5189; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) 5190; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 5191; AVX2-FAST-PERLANE-NEXT: vzeroupper 5192; AVX2-FAST-PERLANE-NEXT: retq 5193; 5194; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6: 5195; AVX2-FAST: # %bb.0: 5196; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 5197; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5198; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 5199; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] 5200; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] 5201; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 5202; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] 5203; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 5204; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5205; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) 5206; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) 5207; AVX2-FAST-NEXT: vzeroupper 5208; AVX2-FAST-NEXT: retq 5209; 5210; AVX512F-LABEL: vec384_v12i32_to_v2i192_factor6: 5211; AVX512F: # %bb.0: 5212; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5213; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5214; AVX512F-NEXT: movb $65, %al 5215; AVX512F-NEXT: kmovw %eax, %k1 5216; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} 5217; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5218; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 5219; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 5220; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5221; AVX512F-NEXT: vzeroupper 5222; AVX512F-NEXT: retq 5223; 5224; AVX512BW-LABEL: vec384_v12i32_to_v2i192_factor6: 5225; AVX512BW: # %bb.0: 5226; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 5227; AVX512BW-NEXT: vpaddb (%rsi), 
%ymm0, %ymm0 5228; AVX512BW-NEXT: movb $65, %al 5229; AVX512BW-NEXT: kmovd %eax, %k1 5230; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} 5231; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5232; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5233; AVX512BW-NEXT: vzeroupper 5234; AVX512BW-NEXT: retq 5235 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5236 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5237 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5238 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5239 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> 5240 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 1, i32 19, i32 20, i32 21, i32 22, i32 23> 5241 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> 5242 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5243 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5244 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5245 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5246 ret void 5247} 5248 5249define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5250; SSE2-LABEL: vec384_v12i32_to_v1i384_factor12: 5251; SSE2: # %bb.0: 5252; SSE2-NEXT: movdqa (%rdi), %xmm0 5253; SSE2-NEXT: paddb (%rsi), %xmm0 5254; SSE2-NEXT: xorps %xmm1, %xmm1 5255; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 5256; SSE2-NEXT: movaps 16(%rdx), %xmm0 5257; SSE2-NEXT: movaps 32(%rdx), %xmm2 5258; SSE2-NEXT: paddb (%rdx), %xmm1 5259; SSE2-NEXT: movaps %xmm0, 16(%rcx) 5260; SSE2-NEXT: movaps %xmm2, 32(%rcx) 5261; SSE2-NEXT: movdqa %xmm1, (%rcx) 5262; SSE2-NEXT: retq 5263; 5264; SSE42-LABEL: vec384_v12i32_to_v1i384_factor12: 5265; SSE42: # %bb.0: 5266; SSE42-NEXT: movdqa (%rdi), %xmm0 5267; SSE42-NEXT: paddb (%rsi), %xmm0 5268; SSE42-NEXT: pxor %xmm1, %xmm1 5269; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5270; SSE42-NEXT: movaps 16(%rdx), %xmm0 5271; SSE42-NEXT: movaps 32(%rdx), %xmm2 5272; SSE42-NEXT: paddb (%rdx), %xmm1 5273; SSE42-NEXT: movaps %xmm0, 16(%rcx) 5274; SSE42-NEXT: movaps %xmm2, 32(%rcx) 5275; SSE42-NEXT: movdqa %xmm1, (%rcx) 5276; SSE42-NEXT: retq 5277; 5278; AVX-LABEL: vec384_v12i32_to_v1i384_factor12: 5279; AVX: # %bb.0: 5280; AVX-NEXT: vmovdqa (%rdi), %xmm0 5281; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5282; AVX-NEXT: vmovaps 32(%rdx), %ymm1 5283; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 5284; AVX-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5285; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5286; AVX-NEXT: vmovaps 16(%rdx), %xmm2 5287; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 5288; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5289; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5290; AVX-NEXT: vzeroupper 5291; AVX-NEXT: retq 5292; 5293; AVX2-LABEL: vec384_v12i32_to_v1i384_factor12: 5294; AVX2: # %bb.0: 5295; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5296; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5297; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 5298; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5299; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 5300; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5301; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 5302; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5303; AVX2-NEXT: vzeroupper 5304; AVX2-NEXT: retq 5305; 5306; AVX512F-LABEL: vec384_v12i32_to_v1i384_factor12: 5307; AVX512F: # %bb.0: 5308; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5309; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5310; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 5311; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5312; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5313; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 5314; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 5315; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5316; AVX512F-NEXT: vzeroupper 5317; AVX512F-NEXT: retq 5318; 5319; AVX512BW-LABEL: vec384_v12i32_to_v1i384_factor12: 5320; AVX512BW: # %bb.0: 5321; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5322; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5323; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 5324; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5325; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5326; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5327; AVX512BW-NEXT: vzeroupper 5328; AVX512BW-NEXT: retq 5329 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5330 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5331 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5332 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5333 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> 5334 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 5335 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> 5336 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5337 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5338 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5339 store <64 x i8> 
%out.vec, ptr %out.vec.ptr, align 64 5340 ret void 5341} 5342 5343define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5344; SSE-LABEL: vec384_v6i64_to_v3i128_factor2: 5345; SSE: # %bb.0: 5346; SSE-NEXT: movdqa (%rdi), %xmm0 5347; SSE-NEXT: movdqa 16(%rdi), %xmm1 5348; SSE-NEXT: paddb (%rsi), %xmm0 5349; SSE-NEXT: paddb 16(%rsi), %xmm1 5350; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero 5351; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero 5352; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 5353; SSE-NEXT: paddb 16(%rdx), %xmm0 5354; SSE-NEXT: paddb (%rdx), %xmm2 5355; SSE-NEXT: paddb 32(%rdx), %xmm1 5356; SSE-NEXT: movdqa %xmm1, 32(%rcx) 5357; SSE-NEXT: movdqa %xmm2, (%rcx) 5358; SSE-NEXT: movdqa %xmm0, 16(%rcx) 5359; SSE-NEXT: retq 5360; 5361; AVX-LABEL: vec384_v6i64_to_v3i128_factor2: 5362; AVX: # %bb.0: 5363; AVX-NEXT: vmovdqa (%rdi), %xmm0 5364; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 5365; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 5366; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5367; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 5368; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5369; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] 5370; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero 5371; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 5372; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 5373; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5374; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 5375; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 5376; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5377; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 5378; AVX-NEXT: vzeroupper 5379; AVX-NEXT: retq 5380; 5381; AVX2-LABEL: vec384_v6i64_to_v3i128_factor2: 5382; AVX2: # %bb.0: 5383; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5384; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5385; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 5386; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] 5387; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] 5388; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 5389; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 5390; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5391; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 5392; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 5393; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 5394; AVX2-NEXT: vzeroupper 5395; AVX2-NEXT: retq 5396; 5397; AVX512F-LABEL: vec384_v6i64_to_v3i128_factor2: 5398; AVX512F: # %bb.0: 5399; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5400; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5401; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 5402; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,0,0] 5403; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 5404; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 5405; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5406; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 5407; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 5408; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 5409; AVX512F-NEXT: vzeroupper 5410; AVX512F-NEXT: retq 5411; 5412; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2: 5413; AVX512BW: # %bb.0: 5414; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 5415; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5416; AVX512BW-NEXT: movb $5, %al 5417; AVX512BW-NEXT: kmovd %eax, %k1 5418; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} 5419; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 5420; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 5421; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 5422; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5423; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5424; 
AVX512BW-NEXT: vzeroupper 5425; AVX512BW-NEXT: retq 5426 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5427 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5428 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5429 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5430 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> 5431 %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 1, i32 9, i32 2, i32 11> 5432 %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> 5433 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5434 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5435 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5436 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5437 ret void 5438} 5439 5440define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5441; SSE2-LABEL: vec384_v6i64_to_v2i192_factor3: 5442; SSE2: # %bb.0: 5443; SSE2-NEXT: movdqa (%rdi), %xmm0 5444; SSE2-NEXT: paddb (%rsi), %xmm0 5445; SSE2-NEXT: pxor %xmm1, %xmm1 5446; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 5447; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 5448; SSE2-NEXT: movaps 32(%rdx), %xmm2 5449; SSE2-NEXT: paddb (%rdx), %xmm0 5450; SSE2-NEXT: paddb 16(%rdx), %xmm1 5451; SSE2-NEXT: movaps %xmm2, 32(%rcx) 5452; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 5453; SSE2-NEXT: movdqa %xmm0, (%rcx) 5454; SSE2-NEXT: retq 5455; 5456; SSE42-LABEL: vec384_v6i64_to_v2i192_factor3: 5457; SSE42: # %bb.0: 5458; SSE42-NEXT: movdqa (%rdi), %xmm0 5459; SSE42-NEXT: paddb (%rsi), %xmm0 5460; SSE42-NEXT: pxor %xmm1, %xmm1 5461; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] 5462; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 5463; SSE42-NEXT: movaps 32(%rdx), %xmm2 5464; SSE42-NEXT: paddb (%rdx), %xmm0 5465; SSE42-NEXT: paddb 16(%rdx), %xmm1 5466; SSE42-NEXT: movaps %xmm2, 32(%rcx) 5467; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 5468; SSE42-NEXT: movdqa %xmm0, (%rcx) 5469; SSE42-NEXT: retq 5470; 5471; AVX-LABEL: vec384_v6i64_to_v2i192_factor3: 5472; AVX: # %bb.0: 5473; AVX-NEXT: vmovdqa (%rdi), %xmm0 5474; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5475; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] 5476; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 5477; AVX-NEXT: vmovaps 32(%rdx), %ymm1 5478; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 5479; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 5480; AVX-NEXT: vpaddb 
(%rdx), %xmm0, %xmm0 5481; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5482; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5483; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 5484; AVX-NEXT: vzeroupper 5485; AVX-NEXT: retq 5486; 5487; AVX2-LABEL: vec384_v6i64_to_v2i192_factor3: 5488; AVX2: # %bb.0: 5489; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5490; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5491; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 5492; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] 5493; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] 5494; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 5495; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5496; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 5497; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5498; AVX2-NEXT: vzeroupper 5499; AVX2-NEXT: retq 5500; 5501; AVX512F-LABEL: vec384_v6i64_to_v2i192_factor3: 5502; AVX512F: # %bb.0: 5503; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5504; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5505; AVX512F-NEXT: movb $9, %al 5506; AVX512F-NEXT: kmovw %eax, %k1 5507; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} 5508; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5509; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 5510; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 5511; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5512; AVX512F-NEXT: vzeroupper 5513; AVX512F-NEXT: retq 5514; 5515; AVX512BW-LABEL: vec384_v6i64_to_v2i192_factor3: 5516; AVX512BW: # %bb.0: 5517; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 5518; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5519; AVX512BW-NEXT: movb $9, %al 5520; AVX512BW-NEXT: kmovd %eax, %k1 5521; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} 5522; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5523; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5524; AVX512BW-NEXT: vzeroupper 5525; AVX512BW-NEXT: retq 5526 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5527 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5528 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5529 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5530 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> 5531 %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 1, i32 10, i32 11> 5532 %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> 5533 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5534 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5535 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5536 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5537 ret void 5538} 5539 5540define void @vec384_v6i64_to_v1i384_factor6(ptr 
%in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5541; SSE-LABEL: vec384_v6i64_to_v1i384_factor6: 5542; SSE: # %bb.0: 5543; SSE-NEXT: movdqa (%rdi), %xmm0 5544; SSE-NEXT: paddb (%rsi), %xmm0 5545; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 5546; SSE-NEXT: movaps 16(%rdx), %xmm1 5547; SSE-NEXT: movaps 32(%rdx), %xmm2 5548; SSE-NEXT: paddb (%rdx), %xmm0 5549; SSE-NEXT: movaps %xmm1, 16(%rcx) 5550; SSE-NEXT: movaps %xmm2, 32(%rcx) 5551; SSE-NEXT: movdqa %xmm0, (%rcx) 5552; SSE-NEXT: retq 5553; 5554; AVX-LABEL: vec384_v6i64_to_v1i384_factor6: 5555; AVX: # %bb.0: 5556; AVX-NEXT: vmovdqa (%rdi), %xmm0 5557; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5558; AVX-NEXT: vmovaps 32(%rdx), %ymm1 5559; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 5560; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5561; AVX-NEXT: vmovaps 16(%rdx), %xmm2 5562; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 5563; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5564; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5565; AVX-NEXT: vzeroupper 5566; AVX-NEXT: retq 5567; 5568; AVX2-LABEL: vec384_v6i64_to_v1i384_factor6: 5569; AVX2: # %bb.0: 5570; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5571; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5572; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 5573; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 5574; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5575; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 5576; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5577; AVX2-NEXT: vzeroupper 5578; AVX2-NEXT: retq 5579; 5580; AVX512F-LABEL: vec384_v6i64_to_v1i384_factor6: 5581; AVX512F: # %bb.0: 5582; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5583; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5584; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 5585; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5586; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 5587; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 5588; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5589; AVX512F-NEXT: vzeroupper 5590; AVX512F-NEXT: retq 5591; 5592; AVX512BW-LABEL: vec384_v6i64_to_v1i384_factor6: 5593; AVX512BW: # %bb.0: 5594; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5595; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5596; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 5597; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5598; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5599; AVX512BW-NEXT: vzeroupper 5600; AVX512BW-NEXT: retq 5601 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5602 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5603 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5604 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5605 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> 5606 %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 9, i32 10, i32 11> 5607 %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> 5608 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 
30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5609 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5610 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5611 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5612 ret void 5613} 5614 5615define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5616; SSE-LABEL: vec384_v3i128_to_v1i384_factor3: 5617; SSE: # %bb.0: 5618; SSE-NEXT: movdqa (%rdi), %xmm0 5619; SSE-NEXT: paddb (%rsi), %xmm0 5620; SSE-NEXT: movaps 16(%rdx), %xmm1 5621; SSE-NEXT: movaps 32(%rdx), %xmm2 5622; SSE-NEXT: paddb (%rdx), %xmm0 5623; SSE-NEXT: movaps %xmm1, 16(%rcx) 5624; SSE-NEXT: movaps %xmm2, 32(%rcx) 5625; SSE-NEXT: movdqa %xmm0, (%rcx) 5626; SSE-NEXT: retq 5627; 5628; AVX-LABEL: vec384_v3i128_to_v1i384_factor3: 5629; AVX: # %bb.0: 5630; AVX-NEXT: vmovdqa (%rdi), %xmm0 5631; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5632; AVX-NEXT: vmovaps 32(%rdx), %ymm1 5633; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5634; AVX-NEXT: vmovaps 16(%rdx), %xmm2 5635; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 5636; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5637; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5638; AVX-NEXT: vzeroupper 5639; AVX-NEXT: retq 5640; 5641; AVX2-LABEL: vec384_v3i128_to_v1i384_factor3: 5642; AVX2: # %bb.0: 5643; AVX2-NEXT: vmovdqa (%rdi), %xmm0 5644; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5645; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 5646; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5647; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 5648; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5649; AVX2-NEXT: vzeroupper 5650; AVX2-NEXT: retq 5651; 5652; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3: 5653; AVX512F: # %bb.0: 5654; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 5655; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5656; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5657; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 5658; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 5659; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5660; AVX512F-NEXT: vzeroupper 5661; AVX512F-NEXT: retq 5662; 5663; AVX512BW-LABEL: vec384_v3i128_to_v1i384_factor3: 5664; AVX512BW: # %bb.0: 5665; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5666; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5667; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5668; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5669; AVX512BW-NEXT: vzeroupper 5670; AVX512BW-NEXT: retq 5671 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5672 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5673 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5674 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5675 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <3 x i128> 5676 %zextd.vec = shufflevector <3 x i128> %in.vec.cast, <3 x i128> zeroinitializer, <3 x i32> <i32 0, i32 4, i32 5> 5677 %out.bytevec = bitcast <3 x i128> %zextd.vec to <48 x i8> 5678 %out.bytevec.padded 
= shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5679 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5680 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5681 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5682 ret void 5683} 5684 5685define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5686; SSE2-LABEL: vec512_v64i8_to_v32i16_factor2: 5687; SSE2: # %bb.0: 5688; SSE2-NEXT: movdqa (%rdi), %xmm0 5689; SSE2-NEXT: movdqa 16(%rdi), %xmm1 5690; SSE2-NEXT: paddb (%rsi), %xmm0 5691; SSE2-NEXT: paddb 16(%rsi), %xmm1 5692; SSE2-NEXT: pxor %xmm2, %xmm2 5693; SSE2-NEXT: movdqa %xmm1, %xmm3 5694; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 5695; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 5696; SSE2-NEXT: movdqa %xmm0, %xmm4 5697; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 5698; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 5699; SSE2-NEXT: paddb 16(%rdx), %xmm0 5700; SSE2-NEXT: paddb (%rdx), %xmm4 5701; SSE2-NEXT: paddb 48(%rdx), %xmm1 5702; SSE2-NEXT: paddb 32(%rdx), %xmm3 5703; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 5704; SSE2-NEXT: movdqa %xmm1, 48(%rcx) 5705; SSE2-NEXT: movdqa %xmm4, (%rcx) 5706; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 5707; SSE2-NEXT: retq 5708; 5709; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2: 5710; SSE42: # %bb.0: 5711; SSE42-NEXT: movdqa (%rdi), %xmm0 5712; SSE42-NEXT: movdqa 16(%rdi), %xmm1 5713; SSE42-NEXT: paddb (%rsi), %xmm0 5714; SSE42-NEXT: paddb 16(%rsi), %xmm1 5715; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 5716; SSE42-NEXT: pxor %xmm3, %xmm3 5717; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 5718; SSE42-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 5719; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 5720; SSE42-NEXT: paddb 16(%rdx), %xmm0 5721; SSE42-NEXT: paddb (%rdx), %xmm4 5722; SSE42-NEXT: paddb 48(%rdx), %xmm1 5723; SSE42-NEXT: paddb 32(%rdx), %xmm2 5724; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 5725; SSE42-NEXT: movdqa %xmm1, 48(%rcx) 5726; SSE42-NEXT: 
movdqa %xmm4, (%rcx) 5727; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 5728; SSE42-NEXT: retq 5729; 5730; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: 5731; AVX: # %bb.0: 5732; AVX-NEXT: vmovdqa (%rdi), %xmm0 5733; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 5734; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 5735; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5736; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 5737; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 5738; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 5739; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 5740; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 5741; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 5742; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 5743; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 5744; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 5745; AVX-NEXT: vmovdqa %xmm2, (%rcx) 5746; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 5747; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 5748; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5749; AVX-NEXT: retq 5750; 5751; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2: 5752; AVX2: # %bb.0: 5753; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5754; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5755; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 5756; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 5757; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 5758; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5759; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 5760; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 5761; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 5762; AVX2-NEXT: vzeroupper 5763; AVX2-NEXT: retq 5764; 5765; AVX512F-LABEL: vec512_v64i8_to_v32i16_factor2: 5766; AVX512F: # %bb.0: 5767; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5768; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5769; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 5770; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 5771; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 5772; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5773; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 5774; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 5775; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 5776; AVX512F-NEXT: vzeroupper 5777; AVX512F-NEXT: retq 5778; 5779; AVX512BW-LABEL: vec512_v64i8_to_v32i16_factor2: 5780; AVX512BW: # %bb.0: 5781; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 5782; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5783; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 5784; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5785; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5786; AVX512BW-NEXT: vzeroupper 5787; AVX512BW-NEXT: retq 5788 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5789 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5790 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5791 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 1, i32 67, i32 2, i32 69, i32 3, i32 71, i32 4, i32 73, i32 5, i32 75, i32 6, i32 77, i32 7, i32 79, i32 8, i32 81, i32 9, i32 83, i32 10, i32 85, i32 11, i32 87, i32 12, i32 89, i32 13, i32 91, i32 14, i32 93, i32 15, i32 95, i32 16, i32 97, i32 17, i32 99, i32 18, i32 101, i32 19, i32 103, i32 20, i32 105, i32 21, i32 107, i32 22, i32 109, i32 23, i32 111, i32 24, i32 113, i32 25, i32 115, i32 26, i32 117, i32 27, i32 119, i32 28, i32 121, i32 29, i32 123, i32 30, i32 125, i32 31, i32 127> 5792 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5793 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias 5794 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5795 ret void 5796} 5797 5798define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5799; SSE2-LABEL: vec512_v64i8_to_v16i32_factor4: 5800; SSE2: # %bb.0: 5801; SSE2-NEXT: movdqa (%rdi), %xmm0 5802; SSE2-NEXT: paddb (%rsi), %xmm0 5803; SSE2-NEXT: pxor %xmm1, %xmm1 5804; SSE2-NEXT: movdqa %xmm0, %xmm2 5805; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 5806; SSE2-NEXT: movdqa %xmm2, %xmm3 5807; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 5808; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5809; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5810; SSE2-NEXT: movdqa %xmm0, %xmm4 5811; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 5812; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5813; SSE2-NEXT: paddb 16(%rdx), %xmm0 5814; SSE2-NEXT: paddb (%rdx), %xmm4 5815; SSE2-NEXT: paddb 48(%rdx), %xmm2 5816; SSE2-NEXT: paddb 32(%rdx), %xmm3 5817; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 5818; SSE2-NEXT: movdqa %xmm2, 48(%rcx) 5819; SSE2-NEXT: movdqa %xmm4, (%rcx) 5820; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 5821; SSE2-NEXT: retq 5822; 5823; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4: 5824; SSE42: # %bb.0: 5825; SSE42-NEXT: movdqa (%rdi), %xmm0 5826; SSE42-NEXT: paddb (%rsi), %xmm0 5827; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 5828; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 5829; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 5830; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] 5831; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 5832; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 5833; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 5834; SSE42-NEXT: paddb 16(%rdx), %xmm0 5835; SSE42-NEXT: paddb 48(%rdx), %xmm3 5836; SSE42-NEXT: paddb 32(%rdx), %xmm2 5837; SSE42-NEXT: paddb (%rdx), %xmm1 5838; SSE42-NEXT: movdqa %xmm1, (%rcx) 5839; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 5840; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 5841; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 5842; SSE42-NEXT: retq 5843; 5844; AVX-LABEL: vec512_v64i8_to_v16i32_factor4: 5845; AVX: # %bb.0: 5846; AVX-NEXT: vmovdqa (%rdi), %xmm0 5847; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5848; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 5849; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 5850; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 5851; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 5852; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 5853; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5854; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 5855; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 5856; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 5857; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 5858; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 5859; AVX-NEXT: vmovdqa %xmm1, (%rcx) 5860; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 5861; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 5862; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) 5863; AVX-NEXT: retq 5864; 5865; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4: 5866; AVX2: # %bb.0: 5867; AVX2-NEXT: vmovdqa (%rdi), %xmm0 5868; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5869; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 5870; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 5871; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 5872; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5873; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 5874; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 5875; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 5876; AVX2-NEXT: vzeroupper 5877; AVX2-NEXT: retq 5878; 5879; AVX512F-LABEL: vec512_v64i8_to_v16i32_factor4: 5880; AVX512F: # %bb.0: 5881; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 5882; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5883; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 5884; AVX512F-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 5885; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 5886; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5887; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5888; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5889; AVX512F-NEXT: vzeroupper 5890; AVX512F-NEXT: retq 5891; 5892; AVX512BW-LABEL: vec512_v64i8_to_v16i32_factor4: 5893; AVX512BW: # %bb.0: 5894; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5895; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5896; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 5897; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5898; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5899; AVX512BW-NEXT: vzeroupper 5900; AVX512BW-NEXT: retq 5901 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5902 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5903 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5904 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 1, i32 69, i32 70, i32 71, i32 2, i32 73, i32 74, i32 75, i32 3, i32 77, i32 78, i32 79, i32 4, i32 81, i32 82, i32 83, i32 5, i32 85, i32 86, i32 87, i32 6, i32 89, i32 90, i32 91, i32 7, i32 93, i32 94, i32 95, i32 8, i32 97, i32 98, i32 99, i32 9, i32 101, i32 102, i32 103, i32 10, i32 105, i32 106, i32 107, i32 11, i32 109, i32 110, i32 111, i32 12, i32 113, i32 114, i32 115, i32 13, i32 117, i32 118, i32 119, i32 14, i32 121, i32 122, i32 123, i32 15, i32 125, i32 126, i32 127> 5905 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5906 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias 5907 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5908 ret void 5909} 5910 5911define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5912; SSE2-LABEL: vec512_v64i8_to_v8i64_factor8: 5913; SSE2: # %bb.0: 5914; SSE2-NEXT: movdqa (%rdi), %xmm0 5915; SSE2-NEXT: paddb (%rsi), %xmm0 5916; SSE2-NEXT: pxor %xmm1, %xmm1 5917; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5918; SSE2-NEXT: movdqa %xmm0, %xmm2 5919; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5920; SSE2-NEXT: movdqa %xmm2, %xmm3 5921; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 5922; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 5923; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5924; SSE2-NEXT: movdqa %xmm0, %xmm4 5925; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5926; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5927; SSE2-NEXT: paddb 16(%rdx), %xmm0 5928; SSE2-NEXT: paddb (%rdx), %xmm4 5929; SSE2-NEXT: paddb 48(%rdx), %xmm2 5930; SSE2-NEXT: paddb 32(%rdx), %xmm3 5931; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 5932; SSE2-NEXT: movdqa %xmm2, 48(%rcx) 5933; SSE2-NEXT: movdqa %xmm4, (%rcx) 5934; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 5935; SSE2-NEXT: retq 5936; 5937; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8: 5938; SSE42: # %bb.0: 5939; 
SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psrlq $48, %xmm3
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: psrld $16, %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: paddb 48(%rdx), %xmm3
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v64i8_to_v8i64_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v64i8_to_v8i64_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v64i8_to_v8i64_factor8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 1, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 3, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 4, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 5, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 6, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 7, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: paddb 16(%rdx), %xmm0
; SSE2-NEXT: paddb 48(%rdx), %xmm3
; SSE2-NEXT: paddb 32(%rdx), %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0]
; SSE42-NEXT: pand %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: paddb 48(%rdx), %xmm3
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v64i8_to_v4i128_factor16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
;
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 6106; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 6107; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6108; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6109; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 6110; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 6111; AVX2-SLOW-NEXT: vzeroupper 6112; AVX2-SLOW-NEXT: retq 6113; 6114; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16: 6115; AVX2-FAST-PERLANE: # %bb.0: 6116; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 6117; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6118; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6119; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6120; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 6121; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] 6122; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 6123; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] 6124; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 6125; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0 6126; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6127; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6128; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 6129; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 6130; AVX2-FAST-PERLANE-NEXT: vzeroupper 6131; AVX2-FAST-PERLANE-NEXT: retq 6132; 6133; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: 6134; AVX2-FAST: # %bb.0: 6135; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 6136; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6137; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6138; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6139; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 6140; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] 6141; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 6142; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] 6143; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 6144; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 6145; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6146; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6147; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 6148; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 6149; AVX2-FAST-NEXT: vzeroupper 6150; AVX2-FAST-NEXT: retq 6151; 6152; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: 6153; AVX512F-SLOW: # %bb.0: 6154; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 6155; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6156; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6157; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 6158; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6159; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 6160; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] 6161; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 6162; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6163; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 6164; 
AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6165; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 6166; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6167; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 6168; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) 6169; AVX512F-SLOW-NEXT: vzeroupper 6170; AVX512F-SLOW-NEXT: retq 6171; 6172; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: 6173; AVX512F-FAST: # %bb.0: 6174; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 6175; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6176; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] 6177; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6178; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 6179; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] 6180; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 6181; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6182; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 6183; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6184; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 6185; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6186; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 6187; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) 6188; AVX512F-FAST-NEXT: vzeroupper 6189; AVX512F-FAST-NEXT: retq 6190; 6191; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: 6192; AVX512BW-SLOW: # %bb.0: 6193; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 6194; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6195; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6196; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 6197; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6198; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 6199; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] 6200; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 6201; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6202; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 6203; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6204; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 6205; AVX512BW-SLOW-NEXT: vzeroupper 6206; AVX512BW-SLOW-NEXT: retq 6207; 6208; AVX512BW-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: 6209; AVX512BW-FAST: # %bb.0: 6210; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 6211; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6212; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] 6213; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 6214; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 6215; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] 6216; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 6217; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 6218; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 6219; 
AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6220; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 6221; AVX512BW-FAST-NEXT: vzeroupper 6222; AVX512BW-FAST-NEXT: retq 6223 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6224 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6225 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6226 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 1, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 2, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 3, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 6227 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6228 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias 6229 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6230 ret void 6231} 6232 6233define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6234; SSE2-LABEL: vec512_v64i8_to_v2i256_factor32: 6235; SSE2: # %bb.0: 6236; SSE2-NEXT: movdqa (%rdi), %xmm0 6237; SSE2-NEXT: paddb (%rsi), %xmm0 6238; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] 6239; SSE2-NEXT: pand %xmm0, %xmm1 6240; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 6241; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6242; SSE2-NEXT: movaps 16(%rdx), %xmm2 6243; SSE2-NEXT: movaps 48(%rdx), %xmm3 6244; SSE2-NEXT: paddb 32(%rdx), %xmm0 6245; SSE2-NEXT: paddb (%rdx), %xmm1 6246; SSE2-NEXT: movaps %xmm3, 48(%rcx) 6247; SSE2-NEXT: movaps %xmm2, 16(%rcx) 6248; SSE2-NEXT: movdqa %xmm1, (%rcx) 6249; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6250; SSE2-NEXT: retq 6251; 6252; SSE42-LABEL: vec512_v64i8_to_v2i256_factor32: 6253; SSE42: # %bb.0: 6254; SSE42-NEXT: movdqa (%rdi), %xmm0 6255; SSE42-NEXT: paddb (%rsi), %xmm0 6256; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] 6257; SSE42-NEXT: pand %xmm0, %xmm1 6258; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 6259; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6260; SSE42-NEXT: movaps 16(%rdx), %xmm2 6261; SSE42-NEXT: movaps 48(%rdx), %xmm3 6262; SSE42-NEXT: paddb 32(%rdx), %xmm0 6263; SSE42-NEXT: paddb (%rdx), %xmm1 6264; SSE42-NEXT: movaps %xmm3, 48(%rcx) 6265; SSE42-NEXT: movaps %xmm2, 16(%rcx) 6266; SSE42-NEXT: movdqa %xmm1, (%rcx) 6267; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 6268; SSE42-NEXT: retq 6269; 6270; AVX-LABEL: vec512_v64i8_to_v2i256_factor32: 6271; AVX: # %bb.0: 6272; AVX-NEXT: vmovdqa (%rdi), %xmm0 6273; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6274; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 6275; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6276; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 6277; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 6278; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6279; 
AVX-NEXT: vmovaps 16(%rdx), %xmm2 6280; AVX-NEXT: vmovaps 48(%rdx), %xmm3 6281; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6282; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 6283; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6284; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 6285; AVX-NEXT: retq 6286; 6287; AVX2-LABEL: vec512_v64i8_to_v2i256_factor32: 6288; AVX2: # %bb.0: 6289; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6290; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6291; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6292; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 6293; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6294; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6295; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6296; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 6297; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 6298; AVX2-NEXT: vzeroupper 6299; AVX2-NEXT: retq 6300; 6301; AVX512F-LABEL: vec512_v64i8_to_v2i256_factor32: 6302; AVX512F: # %bb.0: 6303; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6304; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6305; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6306; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 6307; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6308; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6309; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6310; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 6311; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 6312; AVX512F-NEXT: vzeroupper 6313; AVX512F-NEXT: retq 6314; 6315; AVX512BW-LABEL: vec512_v64i8_to_v2i256_factor32: 6316; AVX512BW: # %bb.0: 6317; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 6318; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6319; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6320; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1 6321; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6322; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 6323; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6324; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6325; AVX512BW-NEXT: vzeroupper 6326; AVX512BW-NEXT: retq 6327 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6328 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6329 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6330 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 1, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 6331 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6332 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias 6333 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6334 ret void 6335} 6336 6337define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6338; SSE-LABEL: 
vec512_v64i8_to_v1i512_factor64: 6339; SSE: # %bb.0: 6340; SSE-NEXT: movdqa (%rdi), %xmm0 6341; SSE-NEXT: paddb (%rsi), %xmm0 6342; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6343; SSE-NEXT: movaps 16(%rdx), %xmm1 6344; SSE-NEXT: movaps 32(%rdx), %xmm2 6345; SSE-NEXT: movaps 48(%rdx), %xmm3 6346; SSE-NEXT: paddb (%rdx), %xmm0 6347; SSE-NEXT: movaps %xmm2, 32(%rcx) 6348; SSE-NEXT: movaps %xmm3, 48(%rcx) 6349; SSE-NEXT: movaps %xmm1, 16(%rcx) 6350; SSE-NEXT: movdqa %xmm0, (%rcx) 6351; SSE-NEXT: retq 6352; 6353; AVX-LABEL: vec512_v64i8_to_v1i512_factor64: 6354; AVX: # %bb.0: 6355; AVX-NEXT: vmovdqa (%rdi), %xmm0 6356; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6357; AVX-NEXT: vmovaps 32(%rdx), %ymm1 6358; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 6359; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6360; AVX-NEXT: vmovaps 16(%rdx), %xmm2 6361; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6362; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 6363; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6364; AVX-NEXT: vzeroupper 6365; AVX-NEXT: retq 6366; 6367; AVX2-LABEL: vec512_v64i8_to_v1i512_factor64: 6368; AVX2: # %bb.0: 6369; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6370; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6371; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6372; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 6373; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 6374; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6375; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 6376; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6377; AVX2-NEXT: vzeroupper 6378; AVX2-NEXT: retq 6379; 6380; AVX512F-LABEL: vec512_v64i8_to_v1i512_factor64: 6381; AVX512F: # %bb.0: 6382; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6383; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6384; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6385; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 6386; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6387; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 6388; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 6389; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6390; AVX512F-NEXT: vzeroupper 6391; AVX512F-NEXT: retq 6392; 6393; AVX512BW-LABEL: vec512_v64i8_to_v1i512_factor64: 6394; AVX512BW: # %bb.0: 6395; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6396; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6397; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6398; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 6399; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6400; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6401; AVX512BW-NEXT: vzeroupper 6402; AVX512BW-NEXT: retq 6403 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6404 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6405 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6406 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 6407 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6408 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias 6409 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6410 ret void 6411} 6412 6413define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr 
%in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6414; SSE2-LABEL: vec512_v32i16_to_v16i32_factor2: 6415; SSE2: # %bb.0: 6416; SSE2-NEXT: movdqa (%rdi), %xmm0 6417; SSE2-NEXT: movdqa 16(%rdi), %xmm1 6418; SSE2-NEXT: paddb (%rsi), %xmm0 6419; SSE2-NEXT: paddb 16(%rsi), %xmm1 6420; SSE2-NEXT: pxor %xmm2, %xmm2 6421; SSE2-NEXT: movdqa %xmm1, %xmm3 6422; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 6423; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 6424; SSE2-NEXT: movdqa %xmm0, %xmm4 6425; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 6426; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 6427; SSE2-NEXT: paddb 16(%rdx), %xmm0 6428; SSE2-NEXT: paddb (%rdx), %xmm4 6429; SSE2-NEXT: paddb 48(%rdx), %xmm1 6430; SSE2-NEXT: paddb 32(%rdx), %xmm3 6431; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 6432; SSE2-NEXT: movdqa %xmm1, 48(%rcx) 6433; SSE2-NEXT: movdqa %xmm4, (%rcx) 6434; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 6435; SSE2-NEXT: retq 6436; 6437; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2: 6438; SSE42: # %bb.0: 6439; SSE42-NEXT: movdqa (%rdi), %xmm0 6440; SSE42-NEXT: movdqa 16(%rdi), %xmm1 6441; SSE42-NEXT: paddb (%rsi), %xmm0 6442; SSE42-NEXT: paddb 16(%rsi), %xmm1 6443; SSE42-NEXT: pxor %xmm2, %xmm2 6444; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 6445; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 6446; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 6447; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 6448; SSE42-NEXT: paddb 16(%rdx), %xmm0 6449; SSE42-NEXT: paddb (%rdx), %xmm4 6450; SSE42-NEXT: paddb 48(%rdx), %xmm1 6451; SSE42-NEXT: paddb 32(%rdx), %xmm3 6452; SSE42-NEXT: movdqa %xmm3, 32(%rcx) 6453; SSE42-NEXT: movdqa %xmm1, 48(%rcx) 6454; SSE42-NEXT: movdqa %xmm4, (%rcx) 6455; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 6456; SSE42-NEXT: retq 6457; 6458; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: 6459; AVX: # %bb.0: 6460; AVX-NEXT: vmovdqa (%rdi), %xmm0 6461; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 6462; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 6463; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6464; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 6465; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 6466; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 6467; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 6468; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 6469; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 6470; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 6471; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 6472; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 6473; AVX-NEXT: vmovdqa %xmm2, (%rcx) 6474; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 6475; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 6476; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 6477; AVX-NEXT: retq 6478; 6479; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2: 6480; AVX2: # %bb.0: 6481; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6482; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6483; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 6484; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 6485; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 6486; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6487; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6488; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 6489; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 6490; AVX2-NEXT: vzeroupper 6491; AVX2-NEXT: retq 6492; 6493; AVX512F-LABEL: vec512_v32i16_to_v16i32_factor2: 6494; AVX512F: # %bb.0: 6495; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6496; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6497; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 6498; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6499; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 6500; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6501; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6502; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6503; AVX512F-NEXT: vzeroupper 6504; AVX512F-NEXT: retq 6505; 6506; AVX512BW-LABEL: vec512_v32i16_to_v16i32_factor2: 6507; AVX512BW: # %bb.0: 6508; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 6509; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6510; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 6511; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6512; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6513; AVX512BW-NEXT: vzeroupper 6514; AVX512BW-NEXT: retq 6515 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6516 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6517 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6518 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6519 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 1, i32 35, i32 2, i32 37, i32 3, i32 39, i32 4, i32 41, i32 5, i32 43, i32 6, i32 45, i32 7, i32 47, i32 8, i32 49, i32 9, i32 51, i32 10, i32 53, i32 11, i32 55, i32 12, i32 57, i32 13, i32 59, i32 14, i32 61, i32 15, i32 63> 6520 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> 6521 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6522 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6523 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6524 ret void 6525} 6526 6527define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6528; SSE2-LABEL: vec512_v32i16_to_v8i64_factor4: 6529; SSE2: # %bb.0: 6530; SSE2-NEXT: movdqa (%rdi), %xmm0 6531; SSE2-NEXT: paddb (%rsi), %xmm0 6532; SSE2-NEXT: pxor %xmm1, %xmm1 6533; SSE2-NEXT: movdqa %xmm0, %xmm2 6534; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 6535; SSE2-NEXT: movdqa %xmm2, %xmm3 6536; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 6537; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 6538; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6539; SSE2-NEXT: movdqa %xmm0, %xmm4 6540; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 6541; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6542; 
SSE2-NEXT: paddb 16(%rdx), %xmm0 6543; SSE2-NEXT: paddb (%rdx), %xmm4 6544; SSE2-NEXT: paddb 48(%rdx), %xmm2 6545; SSE2-NEXT: paddb 32(%rdx), %xmm3 6546; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 6547; SSE2-NEXT: movdqa %xmm2, 48(%rcx) 6548; SSE2-NEXT: movdqa %xmm4, (%rcx) 6549; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 6550; SSE2-NEXT: retq 6551; 6552; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4: 6553; SSE42: # %bb.0: 6554; SSE42-NEXT: movdqa (%rdi), %xmm0 6555; SSE42-NEXT: paddb (%rsi), %xmm0 6556; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6557; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 6558; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 6559; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] 6560; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 6561; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 6562; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6563; SSE42-NEXT: paddb 16(%rdx), %xmm0 6564; SSE42-NEXT: paddb 48(%rdx), %xmm3 6565; SSE42-NEXT: paddb 32(%rdx), %xmm2 6566; SSE42-NEXT: paddb (%rdx), %xmm1 6567; SSE42-NEXT: movdqa %xmm1, (%rcx) 6568; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 6569; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 6570; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 6571; SSE42-NEXT: retq 6572; 6573; AVX-LABEL: vec512_v32i16_to_v8i64_factor4: 6574; AVX: # %bb.0: 6575; AVX-NEXT: vmovdqa (%rdi), %xmm0 6576; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6577; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6578; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 6579; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 6580; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 6581; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 6582; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 6583; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6584; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 6585; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 6586; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 6587; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 6588; AVX-NEXT: vmovdqa %xmm1, (%rcx) 6589; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 6590; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 6591; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) 6592; AVX-NEXT: retq 6593; 6594; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4: 6595; AVX2: # %bb.0: 6596; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6597; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6598; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 6599; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 6600; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 6601; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6602; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6603; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 6604; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 6605; AVX2-NEXT: vzeroupper 6606; AVX2-NEXT: retq 6607; 6608; AVX512F-LABEL: vec512_v32i16_to_v8i64_factor4: 6609; AVX512F: # %bb.0: 6610; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 6611; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6612; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 6613; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6614; 
AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 6615; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6616; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6617; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6618; AVX512F-NEXT: vzeroupper 6619; AVX512F-NEXT: retq 6620; 6621; AVX512BW-LABEL: vec512_v32i16_to_v8i64_factor4: 6622; AVX512BW: # %bb.0: 6623; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 6624; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6625; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 6626; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6627; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6628; AVX512BW-NEXT: vzeroupper 6629; AVX512BW-NEXT: retq 6630 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6631 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6632 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6633 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6634 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 3, i32 45, i32 46, i32 47, i32 4, i32 49, i32 50, i32 51, i32 5, i32 53, i32 54, i32 55, i32 6, i32 57, i32 58, i32 59, i32 7, i32 61, i32 62, i32 63> 6635 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> 6636 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6637 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6638 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6639 ret void 6640} 6641 6642define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6643; SSE2-LABEL: vec512_v32i16_to_v4i128_factor8: 6644; SSE2: # %bb.0: 6645; SSE2-NEXT: movdqa (%rdi), %xmm0 6646; SSE2-NEXT: paddb (%rsi), %xmm0 6647; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] 6648; SSE2-NEXT: pand %xmm0, %xmm1 6649; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 6650; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 6651; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 6652; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6653; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6654; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6655; SSE2-NEXT: paddb 16(%rdx), %xmm3 6656; SSE2-NEXT: paddb 48(%rdx), %xmm2 6657; SSE2-NEXT: paddb 32(%rdx), %xmm0 6658; SSE2-NEXT: paddb (%rdx), %xmm1 6659; SSE2-NEXT: movdqa %xmm1, (%rcx) 6660; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6661; SSE2-NEXT: movdqa %xmm2, 48(%rcx) 6662; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 6663; SSE2-NEXT: retq 6664; 6665; SSE42-LABEL: vec512_v32i16_to_v4i128_factor8: 6666; SSE42: # %bb.0: 6667; SSE42-NEXT: movdqa (%rdi), %xmm0 6668; SSE42-NEXT: paddb (%rsi), %xmm0 6669; SSE42-NEXT: pxor %xmm1, %xmm1 6670; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6671; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 6672; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 6673; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 6674; SSE42-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6675; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6676; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6677; SSE42-NEXT: paddb 16(%rdx), %xmm3 6678; SSE42-NEXT: paddb 48(%rdx), %xmm2 6679; SSE42-NEXT: paddb 32(%rdx), %xmm0 6680; SSE42-NEXT: paddb (%rdx), %xmm1 6681; SSE42-NEXT: movdqa %xmm1, (%rcx) 6682; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 6683; SSE42-NEXT: movdqa %xmm2, 48(%rcx) 6684; SSE42-NEXT: movdqa %xmm3, 16(%rcx) 6685; SSE42-NEXT: retq 6686; 6687; AVX-LABEL: vec512_v32i16_to_v4i128_factor8: 6688; AVX: # %bb.0: 6689; AVX-NEXT: vmovdqa (%rdi), %xmm0 6690; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6691; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 6692; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6693; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 6694; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6695; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 6696; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6697; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 6698; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6699; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 6700; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 6701; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 6702; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 6703; AVX-NEXT: vmovdqa %xmm1, (%rcx) 6704; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 6705; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 6706; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) 6707; AVX-NEXT: retq 6708; 6709; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: 6710; AVX2-SLOW: # %bb.0: 6711; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 6712; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 6713; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 6714; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6715; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] 6716; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] 6717; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 6718; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6719; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6720; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] 6721; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6722; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6723; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 6724; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 6725; AVX2-SLOW-NEXT: vzeroupper 6726; AVX2-SLOW-NEXT: retq 6727; 6728; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8: 6729; AVX2-FAST-PERLANE: # %bb.0: 6730; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 6731; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 6732; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 6733; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6734; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] 6735; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] 6736; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] 6737; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6738; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] 6739; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6740; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6741; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 6742; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 6743; AVX2-FAST-PERLANE-NEXT: vzeroupper 6744; AVX2-FAST-PERLANE-NEXT: retq 6745; 6746; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: 6747; AVX2-FAST: # %bb.0: 6748; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 6749; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 6750; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 6751; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 6752; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] 6753; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] 6754; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] 6755; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6756; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] 6757; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6758; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6759; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 6760; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 6761; AVX2-FAST-NEXT: vzeroupper 6762; AVX2-FAST-NEXT: retq 6763; 6764; AVX512F-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: 6765; AVX512F-SLOW: # %bb.0: 6766; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 6767; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6768; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6769; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6770; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 6771; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 6772; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 6773; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6774; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 6775; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 6776; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6777; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6778; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 6779; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 6780; AVX512F-SLOW-NEXT: vzeroupper 6781; AVX512F-SLOW-NEXT: retq 6782; 6783; AVX512F-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: 6784; AVX512F-FAST: # %bb.0: 6785; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 6786; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6787; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 6788; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] 6789; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 6790; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 6791; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] 6792; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] 6793; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 6794; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6795; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6796; AVX512F-FAST-NEXT: 
vmovdqa %ymm1, (%rcx) 6797; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 6798; AVX512F-FAST-NEXT: vzeroupper 6799; AVX512F-FAST-NEXT: retq 6800; 6801; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8: 6802; AVX512BW: # %bb.0: 6803; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6804; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6805; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15] 6806; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 6807; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 6808; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 6809; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6810; AVX512BW-NEXT: vzeroupper 6811; AVX512BW-NEXT: retq 6812 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6813 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6814 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6815 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6816 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 1, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 2, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 3, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 6817 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> 6818 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6819 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6820 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6821 ret void 6822} 6823 6824define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6825; SSE2-LABEL: vec512_v32i16_to_v2i256_factor16: 6826; SSE2: # %bb.0: 6827; SSE2-NEXT: movdqa (%rdi), %xmm0 6828; SSE2-NEXT: paddb (%rsi), %xmm0 6829; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] 6830; SSE2-NEXT: pand %xmm0, %xmm1 6831; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6832; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6833; SSE2-NEXT: movaps 16(%rdx), %xmm2 6834; SSE2-NEXT: movaps 48(%rdx), %xmm3 6835; SSE2-NEXT: paddb 32(%rdx), %xmm0 6836; SSE2-NEXT: paddb (%rdx), %xmm1 6837; SSE2-NEXT: movaps %xmm3, 48(%rcx) 6838; SSE2-NEXT: movaps %xmm2, 16(%rcx) 6839; SSE2-NEXT: movdqa %xmm1, (%rcx) 6840; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6841; SSE2-NEXT: retq 6842; 6843; SSE42-LABEL: vec512_v32i16_to_v2i256_factor16: 6844; SSE42: # %bb.0: 6845; SSE42-NEXT: movdqa (%rdi), %xmm0 6846; SSE42-NEXT: paddb (%rsi), %xmm0 6847; SSE42-NEXT: pxor %xmm1, %xmm1 6848; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6849; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6850; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6851; SSE42-NEXT: movaps 16(%rdx), %xmm2 6852; SSE42-NEXT: movaps 48(%rdx), %xmm3 6853; SSE42-NEXT: paddb 32(%rdx), %xmm0 6854; SSE42-NEXT: paddb (%rdx), %xmm1 6855; SSE42-NEXT: movaps %xmm3, 48(%rcx) 6856; SSE42-NEXT: movaps %xmm2, 16(%rcx) 6857; SSE42-NEXT: movdqa %xmm1, (%rcx) 6858; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 6859; SSE42-NEXT: retq 6860; 6861; AVX-LABEL: vec512_v32i16_to_v2i256_factor16: 6862; AVX: # %bb.0: 6863; AVX-NEXT: vmovdqa (%rdi), %xmm0 6864; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6865; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 6866; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = 
xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6867; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 6868; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 6869; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 6870; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6871; AVX-NEXT: vmovaps 16(%rdx), %xmm2 6872; AVX-NEXT: vmovaps 48(%rdx), %xmm3 6873; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6874; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 6875; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6876; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 6877; AVX-NEXT: retq 6878; 6879; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16: 6880; AVX2: # %bb.0: 6881; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6882; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6883; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6884; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 6885; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6886; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6887; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6888; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 6889; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 6890; AVX2-NEXT: vzeroupper 6891; AVX2-NEXT: retq 6892; 6893; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16: 6894; AVX512F: # %bb.0: 6895; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6896; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6897; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6898; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 6899; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6900; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6901; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 6902; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 6903; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 6904; AVX512F-NEXT: vzeroupper 6905; AVX512F-NEXT: retq 6906; 6907; AVX512BW-LABEL: vec512_v32i16_to_v2i256_factor16: 6908; AVX512BW: # %bb.0: 6909; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 6910; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6911; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6912; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 6913; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6914; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 6915; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6916; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6917; AVX512BW-NEXT: vzeroupper 6918; AVX512BW-NEXT: retq 6919 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6920 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6921 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6922 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6923 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 1, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 6924 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> 6925 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6926 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6927 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6928 ret void 6929} 6930 6931define void 
@vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6932; SSE2-LABEL: vec512_v32i16_to_v1i512_factor32: 6933; SSE2: # %bb.0: 6934; SSE2-NEXT: movdqa (%rdi), %xmm0 6935; SSE2-NEXT: paddb (%rsi), %xmm0 6936; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6937; SSE2-NEXT: movaps 16(%rdx), %xmm1 6938; SSE2-NEXT: movaps 32(%rdx), %xmm2 6939; SSE2-NEXT: movaps 48(%rdx), %xmm3 6940; SSE2-NEXT: paddb (%rdx), %xmm0 6941; SSE2-NEXT: movaps %xmm2, 32(%rcx) 6942; SSE2-NEXT: movaps %xmm3, 48(%rcx) 6943; SSE2-NEXT: movaps %xmm1, 16(%rcx) 6944; SSE2-NEXT: movdqa %xmm0, (%rcx) 6945; SSE2-NEXT: retq 6946; 6947; SSE42-LABEL: vec512_v32i16_to_v1i512_factor32: 6948; SSE42: # %bb.0: 6949; SSE42-NEXT: movdqa (%rdi), %xmm0 6950; SSE42-NEXT: paddb (%rsi), %xmm0 6951; SSE42-NEXT: pxor %xmm1, %xmm1 6952; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6953; SSE42-NEXT: movaps 16(%rdx), %xmm0 6954; SSE42-NEXT: movaps 32(%rdx), %xmm2 6955; SSE42-NEXT: movaps 48(%rdx), %xmm3 6956; SSE42-NEXT: paddb (%rdx), %xmm1 6957; SSE42-NEXT: movaps %xmm2, 32(%rcx) 6958; SSE42-NEXT: movaps %xmm3, 48(%rcx) 6959; SSE42-NEXT: movaps %xmm0, 16(%rcx) 6960; SSE42-NEXT: movdqa %xmm1, (%rcx) 6961; SSE42-NEXT: retq 6962; 6963; AVX-LABEL: vec512_v32i16_to_v1i512_factor32: 6964; AVX: # %bb.0: 6965; AVX-NEXT: vmovdqa (%rdi), %xmm0 6966; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6967; AVX-NEXT: vmovaps 32(%rdx), %ymm1 6968; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 6969; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 6970; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6971; AVX-NEXT: vmovaps 16(%rdx), %xmm2 6972; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6973; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 6974; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6975; AVX-NEXT: vzeroupper 6976; AVX-NEXT: retq 6977; 6978; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32: 6979; AVX2: # %bb.0: 6980; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6981; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6982; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6983; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 6984; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 6985; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6986; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 6987; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6988; AVX2-NEXT: vzeroupper 6989; AVX2-NEXT: retq 6990; 6991; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32: 6992; AVX512F: # %bb.0: 6993; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6994; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6995; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6996; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 6997; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6998; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 6999; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 7000; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 7001; AVX512F-NEXT: vzeroupper 7002; AVX512F-NEXT: retq 7003; 7004; AVX512BW-LABEL: vec512_v32i16_to_v1i512_factor32: 7005; AVX512BW: # %bb.0: 7006; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 7007; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 7008; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 7009; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 7010; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 7011; AVX512BW-NEXT: vzeroupper 7012; AVX512BW-NEXT: retq 7013 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 7014 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 7015 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 7016 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 7017 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> 
zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 7018 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> 7019 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 7020 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 7021 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 7022 ret void 7023} 7024 7025define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 7026; SSE2-LABEL: vec512_v16i32_to_v8i64_factor2: 7027; SSE2: # %bb.0: 7028; SSE2-NEXT: movdqa (%rdi), %xmm0 7029; SSE2-NEXT: movdqa 16(%rdi), %xmm1 7030; SSE2-NEXT: paddb (%rsi), %xmm0 7031; SSE2-NEXT: paddb 16(%rsi), %xmm1 7032; SSE2-NEXT: pxor %xmm2, %xmm2 7033; SSE2-NEXT: movdqa %xmm1, %xmm3 7034; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 7035; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 7036; SSE2-NEXT: movdqa %xmm0, %xmm4 7037; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 7038; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7039; SSE2-NEXT: paddb 16(%rdx), %xmm0 7040; SSE2-NEXT: paddb (%rdx), %xmm4 7041; SSE2-NEXT: paddb 48(%rdx), %xmm1 7042; SSE2-NEXT: paddb 32(%rdx), %xmm3 7043; SSE2-NEXT: movdqa %xmm3, 32(%rcx) 7044; SSE2-NEXT: movdqa %xmm1, 48(%rcx) 7045; SSE2-NEXT: movdqa %xmm4, (%rcx) 7046; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 7047; SSE2-NEXT: retq 7048; 7049; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2: 7050; SSE42: # %bb.0: 7051; SSE42-NEXT: movdqa (%rdi), %xmm0 7052; SSE42-NEXT: movdqa 16(%rdi), %xmm1 7053; SSE42-NEXT: paddb (%rsi), %xmm0 7054; SSE42-NEXT: paddb 16(%rsi), %xmm1 7055; SSE42-NEXT: pxor %xmm2, %xmm2 7056; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero 7057; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 7058; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero 7059; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7060; SSE42-NEXT: paddb 16(%rdx), %xmm0 7061; SSE42-NEXT: paddb (%rdx), %xmm4 7062; SSE42-NEXT: paddb 48(%rdx), %xmm1 7063; SSE42-NEXT: paddb 32(%rdx), %xmm3 7064; SSE42-NEXT: movdqa %xmm3, 32(%rcx) 7065; SSE42-NEXT: movdqa %xmm1, 48(%rcx) 7066; SSE42-NEXT: movdqa %xmm4, (%rcx) 7067; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 7068; SSE42-NEXT: retq 7069; 7070; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: 7071; AVX: # %bb.0: 7072; AVX-NEXT: vmovdqa (%rdi), %xmm0 7073; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 7074; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 7075; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 7076; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 7077; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 7078; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] 7079; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero 7080; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 7081; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 7082; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 7083; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 7084; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 7085; AVX-NEXT: vmovdqa %xmm2, (%rcx) 7086; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 7087; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 7088; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 7089; AVX-NEXT: retq 7090; 7091; AVX2-LABEL: 
vec512_v16i32_to_v8i64_factor2: 7092; AVX2: # %bb.0: 7093; AVX2-NEXT: vmovdqa (%rdi), %ymm0 7094; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7095; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 7096; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 7097; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 7098; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 7099; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 7100; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 7101; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 7102; AVX2-NEXT: vzeroupper 7103; AVX2-NEXT: retq 7104; 7105; AVX512F-LABEL: vec512_v16i32_to_v8i64_factor2: 7106; AVX512F: # %bb.0: 7107; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 7108; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7109; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 7110; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7111; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 7112; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 7113; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 7114; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 7115; AVX512F-NEXT: vzeroupper 7116; AVX512F-NEXT: retq 7117; 7118; AVX512BW-LABEL: vec512_v16i32_to_v8i64_factor2: 7119; AVX512BW: # %bb.0: 7120; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 7121; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7122; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 7123; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 7124; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 7125; AVX512BW-NEXT: vzeroupper 7126; AVX512BW-NEXT: retq 7127 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 7128 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 7129 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 7130 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 7131 %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31> 7132 %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> 7133 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 7134 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 7135 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 7136 ret void 7137} 7138 7139define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 7140; SSE2-LABEL: vec512_v16i32_to_v4i128_factor4: 7141; SSE2: # %bb.0: 7142; SSE2-NEXT: movdqa (%rdi), %xmm0 7143; SSE2-NEXT: paddb (%rsi), %xmm0 7144; SSE2-NEXT: xorps %xmm1, %xmm1 7145; SSE2-NEXT: movdqa %xmm0, %xmm2 7146; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7147; SSE2-NEXT: xorps %xmm3, %xmm3 7148; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] 7149; SSE2-NEXT: movdqa %xmm0, %xmm4 7150; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 7151; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3] 7152; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] 7153; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 7154; SSE2-NEXT: paddb 16(%rdx), %xmm0 7155; SSE2-NEXT: paddb 32(%rdx), %xmm4 7156; SSE2-NEXT: paddb (%rdx), %xmm3 7157; SSE2-NEXT: paddb 48(%rdx), %xmm2 7158; SSE2-NEXT: movdqa %xmm2, 48(%rcx) 7159; SSE2-NEXT: movdqa 
; SSE2-NEXT: movdqa %xmm4, 32(%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v16i32_to_v4i128_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT: pxor %xmm4, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: paddb 16(%rdx), %xmm3
; SSE42-NEXT: paddb 32(%rdx), %xmm2
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: paddb 48(%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, 48(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,0]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: movb $17, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15]
; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_v16i32_to_v2i256_factor8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: movaps 16(%rdx), %xmm1
; SSE2-NEXT: movaps 48(%rdx), %xmm3
; SSE2-NEXT: paddb 32(%rdx), %xmm0
; SSE2-NEXT: paddb (%rdx), %xmm2
; SSE2-NEXT: movaps %xmm3, 48(%rcx)
; SSE2-NEXT: movaps %xmm1, 16(%rcx)
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v16i32_to_v2i256_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: movaps 16(%rdx), %xmm1
; SSE42-NEXT: movaps 48(%rdx), %xmm3
; SSE42-NEXT: paddb 32(%rdx), %xmm0
; SSE42-NEXT: paddb (%rdx), %xmm2
; SSE42-NEXT: movaps %xmm3, 48(%rcx)
; SSE42-NEXT: movaps %xmm1, 16(%rcx)
; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps 16(%rdx), %xmm2
; AVX-NEXT: vmovaps 48(%rdx), %xmm3
; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX2-FAST-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: movw $257, %ax # imm = 0x101
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec512_v16i32_to_v2i256_factor8:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_v16i32_to_v1i512_factor16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps 16(%rdx), %xmm0
; SSE2-NEXT: movaps 32(%rdx), %xmm2
; SSE2-NEXT: movaps 48(%rdx), %xmm3
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movaps %xmm2, 32(%rcx)
; SSE2-NEXT: movaps %xmm3, 48(%rcx)
; SSE2-NEXT: movaps %xmm0, 16(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v16i32_to_v1i512_factor16:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: movaps 16(%rdx), %xmm0
; SSE42-NEXT: movaps 32(%rdx), %xmm2
; SSE42-NEXT: movaps 48(%rdx), %xmm3
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movaps %xmm2, 32(%rcx)
; SSE42-NEXT: movaps %xmm3, 48(%rcx)
; SSE42-NEXT: movaps %xmm0, 16(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovaps 32(%rdx), %ymm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps 16(%rdx), %xmm2
; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v8i64_to_v4i128_factor2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: paddb 16(%rsi), %xmm1
; SSE-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: paddb 16(%rdx), %xmm0
; SSE-NEXT: paddb (%rdx), %xmm3
; SSE-NEXT: paddb 48(%rdx), %xmm1
; SSE-NEXT: paddb 32(%rdx), %xmm2
; SSE-NEXT: movdqa %xmm2, 32(%rcx)
; SSE-NEXT: movdqa %xmm1, 48(%rcx)
; SSE-NEXT: movdqa %xmm3, (%rcx)
; SSE-NEXT: movdqa %xmm0, 16(%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: movb $85, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: movb $5, %al
; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: movb $5, %al
; AVX512BW-FAST-NEXT: kmovd %eax, %k1
; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,5,3,7]
; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
  %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v8i64_to_v2i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v8i64_to_v2i256_factor4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: movaps 16(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rdx), %xmm3
; SSE-NEXT: paddb (%rdx), %xmm1
; SSE-NEXT: paddb 32(%rdx), %xmm0
; SSE-NEXT: movaps %xmm3, 48(%rcx)
; SSE-NEXT: movaps %xmm2, 16(%rcx)
; SSE-NEXT: movdqa %xmm0, 32(%rcx)
; SSE-NEXT: movdqa %xmm1, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps 16(%rdx), %xmm2
; AVX-NEXT: vmovaps 48(%rdx), %xmm3
; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: movb $17, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v8i64_to_v1i512_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v8i64_to_v1i512_factor8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: movaps 16(%rdx), %xmm1
; SSE-NEXT: movaps 32(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rdx), %xmm3
; SSE-NEXT: paddb (%rdx), %xmm0
; SSE-NEXT: movaps %xmm2, 32(%rcx)
; SSE-NEXT: movaps %xmm3, 48(%rcx)
; SSE-NEXT: movaps %xmm1, 16(%rcx)
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovaps 32(%rdx), %ymm1
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps 16(%rdx), %xmm2
; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v4i128_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v4i128_to_v2i256_factor2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: paddb 16(%rsi), %xmm1
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: movaps 16(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rdx), %xmm3
; SSE-NEXT: paddb (%rdx), %xmm0
; SSE-NEXT: paddb 32(%rdx), %xmm1
; SSE-NEXT: movaps %xmm3, 48(%rcx)
; SSE-NEXT: movaps %xmm2, 16(%rcx)
; SSE-NEXT: movdqa %xmm1, 32(%rcx)
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps 16(%rdx), %xmm2
; AVX-NEXT: vmovaps 48(%rdx), %xmm3
; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa %xmm0, %xmm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: movb $51, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, %xmm1
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v4i128_to_v1i512_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v4i128_to_v1i512_factor4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: movaps 16(%rdx), %xmm1
; SSE-NEXT: movaps 32(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rdx), %xmm3
; SSE-NEXT: paddb (%rdx), %xmm0
; SSE-NEXT: movaps %xmm2, 32(%rcx)
; SSE-NEXT: movaps %xmm3, 48(%rcx)
; SSE-NEXT: movaps %xmm1, 16(%rcx)
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovaps 32(%rdx), %ymm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps 16(%rdx), %xmm2
; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v2i256_to_v1i512_factor2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: paddb 16(%rsi), %xmm1
; SSE-NEXT: movaps 32(%rdx), %xmm2
; SSE-NEXT: movaps 48(%rdx), %xmm3
; SSE-NEXT: paddb 16(%rdx), %xmm1
; SSE-NEXT: paddb (%rdx), %xmm0
; SSE-NEXT: movaps %xmm2, 32(%rcx)
; SSE-NEXT: movaps %xmm3, 48(%rcx)
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: movdqa %xmm1, 16(%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovaps 32(%rdx), %ymm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <2 x i256>
  %zextd.vec = shufflevector <2 x i256> %in.vec.cast, <2 x i256> zeroinitializer, <2 x i32> <i32 0, i32 3>
  %out.bytevec = bitcast <2 x i256> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; FALLBACK8: {{.*}}
; FALLBACK9: {{.*}}