1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12 15; RUN: llc < %s -mtriple=x86_64-- 
-mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13 16 17define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 18; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 19; SSE2: # %bb.0: 20; SSE2-NEXT: movdqa (%rdi), %xmm0 21; SSE2-NEXT: paddb (%rsi), %xmm0 22; SSE2-NEXT: pxor %xmm1, %xmm1 23; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 24; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] 25; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 26; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7] 27; SSE2-NEXT: packuswb %xmm0, %xmm0 28; SSE2-NEXT: paddb (%rdx), %xmm0 29; SSE2-NEXT: movdqa %xmm0, (%rcx) 30; SSE2-NEXT: retq 31; 32; SSE42-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 33; SSE42: # %bb.0: 34; SSE42-NEXT: movdqa (%rdi), %xmm0 35; SSE42-NEXT: paddb (%rsi), %xmm0 36; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 37; SSE42-NEXT: paddb (%rdx), %xmm0 38; SSE42-NEXT: movdqa %xmm0, (%rcx) 39; SSE42-NEXT: retq 40; 41; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 42; AVX: # %bb.0: 43; AVX-NEXT: vmovdqa (%rdi), %xmm0 44; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 45; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 46; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 47; AVX-NEXT: vmovdqa %xmm0, (%rcx) 48; AVX-NEXT: retq 49; 50; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 51; AVX2: # %bb.0: 52; AVX2-NEXT: vmovdqa (%rdi), %xmm0 53; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 54; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 55; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 56; 
AVX2-NEXT: vmovdqa %ymm0, (%rcx) 57; AVX2-NEXT: vzeroupper 58; AVX2-NEXT: retq 59; 60; AVX512F-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 61; AVX512F: # %bb.0: 62; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 63; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 64; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 65; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 66; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 67; AVX512F-NEXT: vzeroupper 68; AVX512F-NEXT: retq 69; 70; AVX512DQ-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 71; AVX512DQ: # %bb.0: 72; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 73; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 74; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 75; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 76; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 77; AVX512DQ-NEXT: vzeroupper 78; AVX512DQ-NEXT: retq 79; 80; AVX512BW-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2: 81; AVX512BW: # %bb.0: 82; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 83; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 84; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 85; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 86; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 87; AVX512BW-NEXT: vzeroupper 88; AVX512BW-NEXT: retq 89 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 90 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 91 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 92 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 93 %out.bytevec.padded = shufflevector <4 x i8> %broadcast.of.aextinreg, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 94 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 95 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 96 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 97 ret void 98} 99 100define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 101; SSE2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 102; SSE2: # %bb.0: 103; SSE2-NEXT: movdqa (%rdi), %xmm0 104; SSE2-NEXT: paddb (%rsi), %xmm0 105; SSE2-NEXT: pxor %xmm1, %xmm1 106; SSE2-NEXT: movdqa %xmm0, %xmm2 107; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 108; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 109; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 110; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 111; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] 112; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 113; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 114; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 115; SSE2-NEXT: packuswb %xmm2, %xmm2 116; SSE2-NEXT: paddb (%rdx), %xmm2 117; SSE2-NEXT: movdqa %xmm2, (%rcx) 118; SSE2-NEXT: retq 119; 120; SSE42-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 
121; SSE42: # %bb.0: 122; SSE42-NEXT: movdqa (%rdi), %xmm0 123; SSE42-NEXT: paddb (%rsi), %xmm0 124; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u] 125; SSE42-NEXT: paddb (%rdx), %xmm0 126; SSE42-NEXT: movdqa %xmm0, (%rcx) 127; SSE42-NEXT: retq 128; 129; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 130; AVX: # %bb.0: 131; AVX-NEXT: vmovdqa (%rdi), %xmm0 132; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 133; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u] 134; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 135; AVX-NEXT: vmovdqa %xmm0, (%rcx) 136; AVX-NEXT: retq 137; 138; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 139; AVX2: # %bb.0: 140; AVX2-NEXT: vmovdqa (%rdi), %xmm0 141; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 142; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u] 143; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 144; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 145; AVX2-NEXT: vzeroupper 146; AVX2-NEXT: retq 147; 148; AVX512F-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 149; AVX512F: # %bb.0: 150; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 151; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 152; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u] 153; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 154; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 155; AVX512F-NEXT: vzeroupper 156; AVX512F-NEXT: retq 157; 158; AVX512DQ-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 159; AVX512DQ: # %bb.0: 160; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 161; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 162; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u] 163; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 164; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 165; AVX512DQ-NEXT: vzeroupper 166; AVX512DQ-NEXT: retq 167; 168; AVX512BW-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4: 169; AVX512BW: # %bb.0: 170; 
AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 171; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 172; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u] 173; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 174; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 175; AVX512BW-NEXT: vzeroupper 176; AVX512BW-NEXT: retq 177 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 178 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 179 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 180 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15> 181 %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.aextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 182 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 183 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 184 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 185 ret void 186} 187 188define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 189; SSE2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 190; SSE2: # %bb.0: 191; SSE2-NEXT: movdqa (%rdi), %xmm0 192; SSE2-NEXT: paddb (%rsi), %xmm0 193; SSE2-NEXT: pxor 
%xmm1, %xmm1 194; SSE2-NEXT: movdqa %xmm0, %xmm2 195; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 196; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] 197; SSE2-NEXT: pand %xmm3, %xmm2 198; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 199; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 200; SSE2-NEXT: pandn %xmm0, %xmm3 201; SSE2-NEXT: por %xmm2, %xmm3 202; SSE2-NEXT: packuswb %xmm3, %xmm3 203; SSE2-NEXT: paddb (%rdx), %xmm3 204; SSE2-NEXT: movdqa %xmm3, (%rcx) 205; SSE2-NEXT: retq 206; 207; SSE42-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 208; SSE42: # %bb.0: 209; SSE42-NEXT: movdqa (%rdi), %xmm0 210; SSE42-NEXT: paddb (%rsi), %xmm0 211; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u] 212; SSE42-NEXT: paddb (%rdx), %xmm0 213; SSE42-NEXT: movdqa %xmm0, (%rcx) 214; SSE42-NEXT: retq 215; 216; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 217; AVX: # %bb.0: 218; AVX-NEXT: vmovdqa (%rdi), %xmm0 219; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 220; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u] 221; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 222; AVX-NEXT: vmovdqa %xmm0, (%rcx) 223; AVX-NEXT: retq 224; 225; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 226; AVX2: # %bb.0: 227; AVX2-NEXT: vmovdqa (%rdi), %xmm0 228; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 229; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u] 230; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 231; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 232; AVX2-NEXT: vzeroupper 233; AVX2-NEXT: retq 234; 235; AVX512F-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 236; AVX512F: # %bb.0: 237; 
AVX512F-NEXT: vmovdqa (%rdi), %xmm0 238; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 239; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u] 240; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 241; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 242; AVX512F-NEXT: vzeroupper 243; AVX512F-NEXT: retq 244; 245; AVX512DQ-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 246; AVX512DQ: # %bb.0: 247; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 248; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 249; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u] 250; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 251; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 252; AVX512DQ-NEXT: vzeroupper 253; AVX512DQ-NEXT: retq 254; 255; AVX512BW-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2: 256; AVX512BW: # %bb.0: 257; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 258; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 259; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u] 260; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 261; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 262; AVX512BW-NEXT: vzeroupper 263; AVX512BW-NEXT: retq 264 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 265 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 266 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 267 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15> 268 %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.aextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 269 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 270 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 271 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 272 ret void 273} 274 275define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 276; SSE2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 277; SSE2: # %bb.0: 278; SSE2-NEXT: movdqa (%rdi), %xmm0 279; SSE2-NEXT: paddb (%rsi), %xmm0 280; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] 281; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 282; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7] 283; SSE2-NEXT: paddb (%rdx), %xmm0 284; SSE2-NEXT: movdqa %xmm0, (%rcx) 285; SSE2-NEXT: retq 286; 287; SSE42-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 288; SSE42: # %bb.0: 289; SSE42-NEXT: movdqa (%rdi), %xmm0 290; SSE42-NEXT: paddb (%rsi), %xmm0 291; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15] 292; SSE42-NEXT: paddb (%rdx), %xmm0 293; SSE42-NEXT: movdqa %xmm0, (%rcx) 294; SSE42-NEXT: retq 295; 296; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 297; AVX: # %bb.0: 298; AVX-NEXT: vmovdqa (%rdi), %xmm0 299; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 300; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15] 301; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 302; AVX-NEXT: vmovdqa %xmm0, (%rcx) 303; AVX-NEXT: retq 304; 305; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 306; AVX2: # %bb.0: 307; AVX2-NEXT: vmovdqa (%rdi), %xmm0 308; 
AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 309; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15] 310; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 311; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 312; AVX2-NEXT: vzeroupper 313; AVX2-NEXT: retq 314; 315; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 316; AVX512F: # %bb.0: 317; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 318; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 319; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] 320; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 321; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 322; AVX512F-NEXT: vzeroupper 323; AVX512F-NEXT: retq 324; 325; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 326; AVX512DQ: # %bb.0: 327; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 328; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 329; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] 330; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 331; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 332; AVX512DQ-NEXT: vzeroupper 333; AVX512DQ-NEXT: retq 334; 335; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: 336; AVX512BW: # %bb.0: 337; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 338; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 339; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] 340; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 341; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 342; AVX512BW-NEXT: vzeroupper 343; AVX512BW-NEXT: retq 344 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 345 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 346 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 347 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 348 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 349 %out.bytevec = bitcast <4 x i16> %broadcast.of.aextinreg to <8 x i8> 350 
%out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 351 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 352 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 353 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 354 ret void 355} 356 357define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 358; SSE2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 359; SSE2: # %bb.0: 360; SSE2-NEXT: movdqa (%rdi), %xmm0 361; SSE2-NEXT: movdqa 16(%rdi), %xmm1 362; SSE2-NEXT: paddb (%rsi), %xmm0 363; SSE2-NEXT: paddb 16(%rsi), %xmm1 364; SSE2-NEXT: psrlw $8, %xmm1 365; SSE2-NEXT: packuswb %xmm1, %xmm1 366; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 367; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 368; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 369; SSE2-NEXT: paddb (%rdx), %xmm0 370; SSE2-NEXT: movdqa %xmm0, (%rcx) 371; SSE2-NEXT: retq 372; 373; SSE42-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 374; SSE42: # %bb.0: 375; SSE42-NEXT: movdqa (%rdi), 
%xmm0 376; SSE42-NEXT: movdqa 16(%rdi), %xmm1 377; SSE42-NEXT: paddb (%rsi), %xmm0 378; SSE42-NEXT: paddb 16(%rsi), %xmm1 379; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 380; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 381; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 382; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 383; SSE42-NEXT: paddb (%rdx), %xmm0 384; SSE42-NEXT: movdqa %xmm0, (%rcx) 385; SSE42-NEXT: retq 386; 387; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 388; AVX: # %bb.0: 389; AVX-NEXT: vmovdqa (%rdi), %xmm0 390; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 391; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 392; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 393; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 394; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 395; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 396; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 397; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 398; AVX-NEXT: vmovdqa %xmm0, (%rcx) 399; AVX-NEXT: retq 400; 401; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 402; AVX2: # %bb.0: 403; AVX2-NEXT: vmovdqa (%rdi), %ymm0 404; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 405; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 406; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 407; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 408; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 409; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 410; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 411; AVX2-NEXT: 
vzeroupper 412; AVX2-NEXT: retq 413; 414; AVX512F-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 415; AVX512F: # %bb.0: 416; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 417; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 418; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 419; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 420; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 421; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 422; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 423; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 424; AVX512F-NEXT: vzeroupper 425; AVX512F-NEXT: retq 426; 427; AVX512DQ-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 428; AVX512DQ: # %bb.0: 429; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 430; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 431; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 432; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 433; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0 434; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 435; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 436; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 437; AVX512DQ-NEXT: vzeroupper 438; AVX512DQ-NEXT: retq 439; 440; AVX512BW-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8: 441; AVX512BW: # %bb.0: 442; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 443; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 444; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 445; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 446; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0 447; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 448; AVX512BW-NEXT: 
vpaddb (%rdx), %zmm0, %zmm0 449; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 450; AVX512BW-NEXT: vzeroupper 451; AVX512BW-NEXT: retq 452 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 453 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 454 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 455 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31> 456 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 457 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 458 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 459 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 460 ret void 461} 462 463define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 464; SSE2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 465; SSE2: # %bb.0: 466; SSE2-NEXT: movdqa (%rdi), %xmm0 467; SSE2-NEXT: movdqa 16(%rdi), %xmm1 468; SSE2-NEXT: paddb (%rsi), %xmm0 469; SSE2-NEXT: paddb 16(%rsi), %xmm1 470; SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 471; SSE2-NEXT: pand %xmm2, %xmm1 472; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 473; SSE2-NEXT: pandn %xmm0, %xmm2 474; SSE2-NEXT: por %xmm1, %xmm2 475; SSE2-NEXT: paddb (%rdx), %xmm2 476; SSE2-NEXT: movdqa %xmm2, (%rcx) 477; SSE2-NEXT: retq 478; 479; SSE42-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 480; SSE42: # %bb.0: 481; SSE42-NEXT: movdqa (%rdi), %xmm0 482; SSE42-NEXT: movdqa 16(%rdi), %xmm1 483; SSE42-NEXT: paddb 16(%rsi), %xmm1 484; SSE42-NEXT: paddb (%rsi), %xmm0 485; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 486; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 487; SSE42-NEXT: paddb (%rdx), %xmm0 488; SSE42-NEXT: movdqa %xmm0, (%rcx) 489; SSE42-NEXT: retq 490; 491; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 492; AVX: # %bb.0: 493; AVX-NEXT: vmovdqa (%rdi), %xmm0 494; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 495; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 496; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 497; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 498; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 499; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 500; AVX-NEXT: vmovdqa %xmm0, (%rcx) 501; AVX-NEXT: retq 502; 503; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 504; AVX2: # %bb.0: 505; AVX2-NEXT: vmovdqa (%rdi), %ymm0 506; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 507; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 508; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 509; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 510; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 511; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 512; AVX2-NEXT: vzeroupper 513; AVX2-NEXT: retq 514; 515; AVX512F-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 
516; AVX512F: # %bb.0: 517; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 518; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 519; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 520; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 521; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 522; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 523; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 524; AVX512F-NEXT: vzeroupper 525; AVX512F-NEXT: retq 526; 527; AVX512DQ-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 528; AVX512DQ: # %bb.0: 529; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 530; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 531; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 532; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 533; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 534; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 535; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 536; AVX512DQ-NEXT: vzeroupper 537; AVX512DQ-NEXT: retq 538; 539; AVX512BW-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4: 540; AVX512BW: # %bb.0: 541; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 542; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 543; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 544; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 545; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 546; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 547; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 548; AVX512BW-NEXT: vzeroupper 549; AVX512BW-NEXT: retq 550 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 551 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 552 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 553 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 
25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31> 554 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 555 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 556 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 557 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 558 ret void 559} 560 561define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 562; SSE2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 563; SSE2: # %bb.0: 564; SSE2-NEXT: movdqa (%rdi), %xmm0 565; SSE2-NEXT: movdqa 16(%rdi), %xmm1 566; SSE2-NEXT: paddb (%rsi), %xmm0 567; SSE2-NEXT: paddb 16(%rsi), %xmm1 568; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 569; SSE2-NEXT: pand %xmm2, %xmm1 570; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 571; SSE2-NEXT: pandn %xmm0, %xmm2 572; SSE2-NEXT: por %xmm1, %xmm2 573; SSE2-NEXT: paddb (%rdx), %xmm2 574; SSE2-NEXT: movdqa %xmm2, (%rcx) 575; SSE2-NEXT: retq 576; 577; SSE42-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 578; SSE42: # %bb.0: 579; SSE42-NEXT: movdqa (%rdi), %xmm0 580; SSE42-NEXT: movdqa 16(%rdi), %xmm1 581; SSE42-NEXT: paddb 16(%rsi), %xmm1 
582; SSE42-NEXT: paddb (%rsi), %xmm0 583; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 584; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 585; SSE42-NEXT: paddb (%rdx), %xmm0 586; SSE42-NEXT: movdqa %xmm0, (%rcx) 587; SSE42-NEXT: retq 588; 589; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 590; AVX: # %bb.0: 591; AVX-NEXT: vmovdqa (%rdi), %xmm0 592; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 593; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 594; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 595; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 596; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 597; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 598; AVX-NEXT: vmovdqa %xmm0, (%rcx) 599; AVX-NEXT: retq 600; 601; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 602; AVX2: # %bb.0: 603; AVX2-NEXT: vmovdqa (%rdi), %ymm0 604; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 605; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 606; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 607; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 608; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 609; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 610; AVX2-NEXT: vzeroupper 611; AVX2-NEXT: retq 612; 613; AVX512F-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 614; AVX512F: # %bb.0: 615; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 616; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 617; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 618; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 619; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 620; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 621; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 622; AVX512F-NEXT: vzeroupper 623; AVX512F-NEXT: retq 624; 625; AVX512DQ-LABEL: 
vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 626; AVX512DQ: # %bb.0: 627; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 628; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 629; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 630; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 631; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 632; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 633; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 634; AVX512DQ-NEXT: vzeroupper 635; AVX512DQ-NEXT: retq 636; 637; AVX512BW-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2: 638; AVX512BW: # %bb.0: 639; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 640; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 641; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 642; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 643; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 644; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 645; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 646; AVX512BW-NEXT: vzeroupper 647; AVX512BW-NEXT: retq 648 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 649 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 650 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 651 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 652 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 653 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 654 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 655 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 656 ret void 657} 658 659define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 660; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 661; SSE2: # %bb.0: 662; SSE2-NEXT: movdqa (%rdi), %xmm0 663; SSE2-NEXT: movdqa 16(%rdi), %xmm1 664; SSE2-NEXT: paddb 16(%rsi), %xmm1 665; SSE2-NEXT: paddb (%rsi), %xmm0 666; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 667; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 668; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 669; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 670; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 671; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 672; SSE2-NEXT: paddb (%rdx), %xmm0 673; SSE2-NEXT: movdqa %xmm0, (%rcx) 674; SSE2-NEXT: retq 675; 676; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 677; SSE42: # %bb.0: 678; SSE42-NEXT: movdqa (%rdi), %xmm0 679; SSE42-NEXT: movdqa 16(%rdi), %xmm1 680; SSE42-NEXT: paddb (%rsi), %xmm0 681; SSE42-NEXT: paddb 16(%rsi), %xmm1 682; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 683; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 684; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 685; SSE42-NEXT: paddb (%rdx), %xmm0 686; SSE42-NEXT: movdqa 
%xmm0, (%rcx) 687; SSE42-NEXT: retq 688; 689; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 690; AVX: # %bb.0: 691; AVX-NEXT: vmovdqa (%rdi), %xmm0 692; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 693; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 694; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 695; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 696; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 697; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 698; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 699; AVX-NEXT: vmovdqa %xmm0, (%rcx) 700; AVX-NEXT: retq 701; 702; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 703; AVX2: # %bb.0: 704; AVX2-NEXT: vmovdqa (%rdi), %ymm0 705; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 706; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 707; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 708; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 709; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 710; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 711; AVX2-NEXT: vzeroupper 712; AVX2-NEXT: retq 713; 714; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 715; AVX512F: # %bb.0: 716; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 717; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 718; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 719; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 720; AVX512F-NEXT: vmovd %xmm0, %eax 721; AVX512F-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0 722; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 723; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 724; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 725; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 726; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] 727; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 728; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 729; 
AVX512F-NEXT: vzeroupper 730; AVX512F-NEXT: retq 731; 732; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 733; AVX512DQ: # %bb.0: 734; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 735; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 736; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 737; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 738; AVX512DQ-NEXT: vmovd %xmm0, %eax 739; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0 740; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 741; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 742; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 743; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 744; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] 745; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 746; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 747; AVX512DQ-NEXT: vzeroupper 748; AVX512DQ-NEXT: retq 749; 750; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 751; AVX512BW-SLOW: # %bb.0: 752; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 753; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] 754; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 755; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 756; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 757; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 758; AVX512BW-SLOW-NEXT: vzeroupper 759; AVX512BW-SLOW-NEXT: retq 760; 761; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: 762; AVX512BW-FAST: # %bb.0: 763; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 764; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] 765; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 766; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 767; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax 768; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 769; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 770; AVX512BW-FAST-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] 771; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 772; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 773; AVX512BW-FAST-NEXT: vzeroupper 774; AVX512BW-FAST-NEXT: retq 775 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 776 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 777 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 778 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 779 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15> 780 %out.bytevec = bitcast <8 x i16> %broadcast.of.aextinreg to <16 x i8> 781 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 782 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 783 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 784 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 785 ret void 786} 787 788define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 789; SSE2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 790; SSE2: # %bb.0: 791; SSE2-NEXT: movdqa (%rdi), %xmm0 792; SSE2-NEXT: movdqa 16(%rdi), %xmm1 793; 
SSE2-NEXT: paddb (%rsi), %xmm0 794; SSE2-NEXT: paddb 16(%rsi), %xmm1 795; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535] 796; SSE2-NEXT: pand %xmm2, %xmm1 797; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 798; SSE2-NEXT: pandn %xmm0, %xmm2 799; SSE2-NEXT: por %xmm1, %xmm2 800; SSE2-NEXT: paddb (%rdx), %xmm2 801; SSE2-NEXT: movdqa %xmm2, (%rcx) 802; SSE2-NEXT: retq 803; 804; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 805; SSE42: # %bb.0: 806; SSE42-NEXT: movdqa (%rdi), %xmm0 807; SSE42-NEXT: movdqa 16(%rdi), %xmm1 808; SSE42-NEXT: paddb 16(%rsi), %xmm1 809; SSE42-NEXT: paddb (%rsi), %xmm0 810; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 811; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 812; SSE42-NEXT: paddb (%rdx), %xmm0 813; SSE42-NEXT: movdqa %xmm0, (%rcx) 814; SSE42-NEXT: retq 815; 816; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 817; AVX: # %bb.0: 818; AVX-NEXT: vmovdqa (%rdi), %xmm0 819; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 820; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 821; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 822; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 823; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 824; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 825; AVX-NEXT: vmovdqa %xmm0, (%rcx) 826; AVX-NEXT: retq 827; 828; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 829; AVX2: # %bb.0: 830; AVX2-NEXT: vmovdqa (%rdi), %ymm0 831; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 832; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 833; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 834; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 835; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 836; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 837; AVX2-NEXT: vzeroupper 838; AVX2-NEXT: retq 839; 840; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 841; AVX512F: # %bb.0: 842; AVX512F-NEXT: 
vmovdqa (%rdi), %ymm0 843; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 844; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 845; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] 846; AVX512F-NEXT: vmovd %xmm0, %eax 847; AVX512F-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0 848; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 849; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 850; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 851; AVX512F-NEXT: vzeroupper 852; AVX512F-NEXT: retq 853; 854; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 855; AVX512DQ: # %bb.0: 856; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 857; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 858; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 859; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] 860; AVX512DQ-NEXT: vmovd %xmm0, %eax 861; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0 862; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 863; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 864; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 865; AVX512DQ-NEXT: vzeroupper 866; AVX512DQ-NEXT: retq 867; 868; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 869; AVX512BW-SLOW: # %bb.0: 870; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 871; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] 872; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 873; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 874; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 875; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 876; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 877; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 878; AVX512BW-SLOW-NEXT: vzeroupper 879; AVX512BW-SLOW-NEXT: retq 880; 881; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: 882; AVX512BW-FAST: # %bb.0: 883; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 884; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = 
[0,9,10,11,0,5,6,7] 885; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 886; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 887; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 888; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] 889; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 890; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 891; AVX512BW-FAST-NEXT: vzeroupper 892; AVX512BW-FAST-NEXT: retq 893 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 894 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 895 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 896 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 897 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15> 898 %out.bytevec = bitcast <8 x i16> %broadcast.of.aextinreg to <16 x i8> 899 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 900 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 901 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 902 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 903 ret void 904} 905 906define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, 
ptr %out.vec.ptr) nounwind { 907; SSE2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 908; SSE2: # %bb.0: 909; SSE2-NEXT: movdqa (%rdi), %xmm0 910; SSE2-NEXT: movdqa 16(%rdi), %xmm1 911; SSE2-NEXT: paddb (%rsi), %xmm0 912; SSE2-NEXT: paddb 16(%rsi), %xmm1 913; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 914; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 915; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 916; SSE2-NEXT: paddb (%rdx), %xmm0 917; SSE2-NEXT: movdqa %xmm0, (%rcx) 918; SSE2-NEXT: retq 919; 920; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 921; SSE42: # %bb.0: 922; SSE42-NEXT: movdqa (%rdi), %xmm0 923; SSE42-NEXT: movdqa 16(%rdi), %xmm1 924; SSE42-NEXT: paddb 16(%rsi), %xmm1 925; SSE42-NEXT: paddb (%rsi), %xmm0 926; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 927; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 928; SSE42-NEXT: paddb (%rdx), %xmm0 929; SSE42-NEXT: movdqa %xmm0, (%rcx) 930; SSE42-NEXT: retq 931; 932; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 933; AVX: # %bb.0: 934; AVX-NEXT: vmovdqa (%rdi), %xmm0 935; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 936; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 937; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 938; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 939; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 940; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 941; AVX-NEXT: vmovdqa %xmm0, (%rcx) 942; AVX-NEXT: retq 943; 944; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 945; AVX2-SLOW: # %bb.0: 946; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 947; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 948; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 949; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 950; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 951; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 952; AVX2-SLOW-NEXT: vmovdqa 
%ymm0, (%rcx) 953; AVX2-SLOW-NEXT: vzeroupper 954; AVX2-SLOW-NEXT: retq 955; 956; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 957; AVX2-FAST-PERLANE: # %bb.0: 958; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 959; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 960; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 961; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %xmm0 962; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 963; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 964; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) 965; AVX2-FAST-PERLANE-NEXT: vzeroupper 966; AVX2-FAST-PERLANE-NEXT: retq 967; 968; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 969; AVX2-FAST: # %bb.0: 970; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 971; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] 972; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 973; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 974; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 975; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) 976; AVX2-FAST-NEXT: vzeroupper 977; AVX2-FAST-NEXT: retq 978; 979; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 980; AVX512F: # %bb.0: 981; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] 982; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 983; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 984; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 985; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 986; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 987; AVX512F-NEXT: vzeroupper 988; AVX512F-NEXT: retq 989; 990; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 991; AVX512DQ: # %bb.0: 992; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] 993; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 994; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 995; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 996; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 997; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 998; 
AVX512DQ-NEXT: vzeroupper 999; AVX512DQ-NEXT: retq 1000; 1001; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: 1002; AVX512BW: # %bb.0: 1003; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1004; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] 1005; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1006; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 1007; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1008; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1009; AVX512BW-NEXT: vzeroupper 1010; AVX512BW-NEXT: retq 1011 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1012 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1013 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1014 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 1015 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 1016 %out.bytevec = bitcast <4 x i32> %broadcast.of.aextinreg to <16 x i8> 1017 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1018 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1019 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1020 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1021 ret void 1022} 1023 1024define void 
@vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1025; SSE2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1026; SSE2: # %bb.0: 1027; SSE2-NEXT: movdqa (%rdi), %xmm0 1028; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1029; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1030; SSE2-NEXT: paddb 48(%rsi), %xmm2 1031; SSE2-NEXT: paddb (%rsi), %xmm0 1032; SSE2-NEXT: paddb 32(%rsi), %xmm1 1033; SSE2-NEXT: psrlw $8, %xmm1 1034; SSE2-NEXT: packuswb %xmm1, %xmm1 1035; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1036; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1037; SSE2-NEXT: movdqa %xmm0, %xmm3 1038; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1039; SSE2-NEXT: psrlw $8, %xmm2 1040; SSE2-NEXT: packuswb %xmm2, %xmm2 1041; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1042; SSE2-NEXT: paddb 16(%rdx), %xmm0 1043; SSE2-NEXT: paddb (%rdx), %xmm3 1044; SSE2-NEXT: movdqa %xmm3, (%rcx) 1045; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1046; SSE2-NEXT: retq 1047; 1048; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1049; SSE42: # %bb.0: 1050; SSE42-NEXT: movdqa (%rdi), %xmm0 1051; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1052; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1053; SSE42-NEXT: paddb 48(%rsi), %xmm2 1054; SSE42-NEXT: paddb (%rsi), %xmm0 1055; SSE42-NEXT: paddb 32(%rsi), %xmm1 1056; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] 1057; SSE42-NEXT: pshufb %xmm3, %xmm1 1058; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1059; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1060; SSE42-NEXT: movdqa %xmm0, %xmm4 1061; SSE42-NEXT: 
punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 1062; SSE42-NEXT: pshufb %xmm3, %xmm2 1063; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1064; SSE42-NEXT: paddb 16(%rdx), %xmm0 1065; SSE42-NEXT: paddb (%rdx), %xmm4 1066; SSE42-NEXT: movdqa %xmm4, (%rcx) 1067; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1068; SSE42-NEXT: retq 1069; 1070; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1071; AVX: # %bb.0: 1072; AVX-NEXT: vmovdqa (%rdi), %xmm0 1073; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1074; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1075; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1076; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1077; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1078; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] 1079; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1080; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1081; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1082; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1083; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1084; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1085; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1086; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1087; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1088; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1089; AVX-NEXT: retq 1090; 1091; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1092; AVX2: # %bb.0: 1093; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1094; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1095; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] 1096; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1097; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1098; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 1099; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] 1100; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1101; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1102; AVX2-NEXT: vzeroupper 1103; AVX2-NEXT: retq 1104; 1105; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1106; AVX512F: # %bb.0: 1107; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1108; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1109; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] 1110; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1111; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1112; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 1113; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] 1114; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1115; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1116; AVX512F-NEXT: vzeroupper 1117; AVX512F-NEXT: retq 1118; 1119; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1120; AVX512DQ: # %bb.0: 1121; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1122; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1123; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] 1124; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1125; AVX512DQ-NEXT: vpaddb 
(%rsi), %xmm1, %xmm1 1126; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 1127; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] 1128; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1129; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1130; AVX512DQ-NEXT: vzeroupper 1131; AVX512DQ-NEXT: retq 1132; 1133; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1134; AVX512BW: # %bb.0: 1135; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1136; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1137; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1138; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] 1139; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 1140; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1141; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1142; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1143; AVX512BW-NEXT: vzeroupper 1144; AVX512BW-NEXT: retq 1145 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1146 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1147 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1148 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63> 1149 %out.bytevec.padded = 
shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1150 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1151 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1152 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1153 ret void 1154} 1155 1156define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1157; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1158; SSE2: # %bb.0: 1159; SSE2-NEXT: movdqa (%rdi), %xmm0 1160; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1161; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1162; SSE2-NEXT: paddb 48(%rsi), %xmm2 1163; SSE2-NEXT: paddb (%rsi), %xmm0 1164; SSE2-NEXT: paddb 32(%rsi), %xmm1 1165; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 1166; SSE2-NEXT: pand %xmm3, %xmm1 1167; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1168; SSE2-NEXT: pand %xmm3, %xmm2 1169; SSE2-NEXT: pandn %xmm0, %xmm3 1170; SSE2-NEXT: por %xmm3, %xmm1 1171; SSE2-NEXT: por %xmm2, %xmm3 1172; SSE2-NEXT: paddb 16(%rdx), %xmm3 1173; SSE2-NEXT: paddb (%rdx), %xmm1 1174; SSE2-NEXT: movdqa %xmm1, (%rcx) 1175; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 1176; SSE2-NEXT: retq 1177; 1178; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1179; SSE42: # 
%bb.0: 1180; SSE42-NEXT: movdqa (%rdi), %xmm0 1181; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1182; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1183; SSE42-NEXT: paddb 48(%rsi), %xmm2 1184; SSE42-NEXT: paddb 32(%rsi), %xmm1 1185; SSE42-NEXT: paddb (%rsi), %xmm0 1186; SSE42-NEXT: movdqa %xmm0, %xmm3 1187; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] 1188; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 1189; SSE42-NEXT: pshufb %xmm1, %xmm3 1190; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1191; SSE42-NEXT: pshufb %xmm1, %xmm0 1192; SSE42-NEXT: paddb 16(%rdx), %xmm0 1193; SSE42-NEXT: paddb (%rdx), %xmm3 1194; SSE42-NEXT: movdqa %xmm3, (%rcx) 1195; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1196; SSE42-NEXT: retq 1197; 1198; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1199; AVX: # %bb.0: 1200; AVX-NEXT: vmovdqa (%rdi), %xmm0 1201; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1202; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1203; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1204; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1205; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1206; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1207; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 1208; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1209; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1210; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1211; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1212; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1213; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1214; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1215; AVX-NEXT: retq 1216; 1217; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1218; AVX2: # %bb.0: 1219; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1220; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1221; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1222; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1223; AVX2-NEXT: 
vpbroadcastd %xmm1, %ymm1 1224; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 1225; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1226; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1227; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1228; AVX2-NEXT: vzeroupper 1229; AVX2-NEXT: retq 1230; 1231; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1232; AVX512F: # %bb.0: 1233; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1234; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1235; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1236; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1237; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 1238; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1239; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1240; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1241; AVX512F-NEXT: vzeroupper 1242; AVX512F-NEXT: retq 1243; 1244; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1245; AVX512DQ: # %bb.0: 1246; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1247; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1248; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1249; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1250; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1 1251; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1252; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1253; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1254; AVX512DQ-NEXT: vzeroupper 1255; AVX512DQ-NEXT: retq 1256; 1257; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1258; AVX512BW: # %bb.0: 1259; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1260; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1261; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1262; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0 1263; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 1264; AVX512BW-NEXT: kmovd %eax, %k1 1265; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 1266; AVX512BW-NEXT: vpaddb (%rdx), 
%zmm1, %zmm0 1267; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1268; AVX512BW-NEXT: vzeroupper 1269; AVX512BW-NEXT: retq 1270 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1271 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1272 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1273 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63> 1274 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1275 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1276 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1277 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1278 ret void 1279} 1280 1281define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1282; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1283; SSE2: # %bb.0: 1284; SSE2-NEXT: movdqa (%rdi), %xmm0 1285; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1286; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1287; SSE2-NEXT: paddb 48(%rsi), %xmm2 
1288; SSE2-NEXT: paddb (%rsi), %xmm0 1289; SSE2-NEXT: paddb 32(%rsi), %xmm1 1290; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 1291; SSE2-NEXT: pand %xmm3, %xmm1 1292; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1293; SSE2-NEXT: pand %xmm3, %xmm2 1294; SSE2-NEXT: pandn %xmm0, %xmm3 1295; SSE2-NEXT: por %xmm3, %xmm1 1296; SSE2-NEXT: por %xmm2, %xmm3 1297; SSE2-NEXT: paddb 16(%rdx), %xmm3 1298; SSE2-NEXT: paddb (%rdx), %xmm1 1299; SSE2-NEXT: movdqa %xmm1, (%rcx) 1300; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 1301; SSE2-NEXT: retq 1302; 1303; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1304; SSE42: # %bb.0: 1305; SSE42-NEXT: movdqa (%rdi), %xmm0 1306; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1307; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1308; SSE42-NEXT: paddb 48(%rsi), %xmm2 1309; SSE42-NEXT: paddb 32(%rsi), %xmm1 1310; SSE42-NEXT: paddb (%rsi), %xmm0 1311; SSE42-NEXT: movdqa %xmm0, %xmm3 1312; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] 1313; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 1314; SSE42-NEXT: pshufb %xmm1, %xmm3 1315; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1316; SSE42-NEXT: pshufb %xmm1, %xmm0 1317; SSE42-NEXT: paddb 16(%rdx), %xmm0 1318; SSE42-NEXT: paddb (%rdx), %xmm3 1319; SSE42-NEXT: movdqa %xmm3, (%rcx) 1320; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1321; SSE42-NEXT: retq 1322; 1323; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1324; AVX: # %bb.0: 1325; AVX-NEXT: vmovdqa (%rdi), %xmm0 1326; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1327; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1328; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1329; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1330; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1331; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1332; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = 
[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 1333; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1334; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1335; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1336; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1337; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1338; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1339; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1340; AVX-NEXT: retq 1341; 1342; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1343; AVX2: # %bb.0: 1344; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1345; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1346; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1347; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1348; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 1349; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 1350; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1351; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1352; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1353; AVX2-NEXT: vzeroupper 1354; AVX2-NEXT: retq 1355; 1356; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1357; AVX512F: # %bb.0: 1358; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1359; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1360; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1361; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1362; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 1363; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1364; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1365; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1366; AVX512F-NEXT: vzeroupper 1367; AVX512F-NEXT: retq 1368; 1369; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1370; AVX512DQ: # %bb.0: 1371; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1372; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1373; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1374; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1375; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 1376; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1377; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1378; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1379; AVX512DQ-NEXT: vzeroupper 1380; AVX512DQ-NEXT: retq 1381; 1382; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1383; AVX512BW: # %bb.0: 1384; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1385; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1386; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1387; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0 1388; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 1389; AVX512BW-NEXT: kmovd %eax, %k1 1390; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 1391; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 1392; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1393; AVX512BW-NEXT: vzeroupper 1394; AVX512BW-NEXT: retq 1395 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1396 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1397 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1398 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1399 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1400 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1401 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1402 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1403 ret void 1404} 1405
; The input vector is the byte-wise sum of the two <64 x i8> loads. The first
; shufflevector in this function builds a <32 x i8> whose lanes 0 and 16 both
; hold input byte 0, while every other lane i holds input byte 32 + i. That
; result is padded to <64 x i8> (upper 32 lanes undef), added to the bias loaded
; from %out.vec.bias.ptr and stored to %out.vec.ptr.
; The per-target assertions are autogenerated by utils/update_llc_test_checks.py
; and should be regenerated with that script rather than edited by hand.
1406define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1407; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1408; SSE2: # %bb.0: 1409; SSE2-NEXT: movdqa (%rdi), %xmm0 1410; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1411; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1412; SSE2-NEXT: paddb 48(%rsi), %xmm2 1413; SSE2-NEXT: paddb 32(%rsi), %xmm1 1414; SSE2-NEXT: paddb (%rsi), %xmm0 1415; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1416; SSE2-NEXT: pand %xmm3, %xmm1 1417; SSE2-NEXT: pand %xmm3, %xmm2 1418; SSE2-NEXT: pandn %xmm0, %xmm3 1419; SSE2-NEXT: por %xmm3, %xmm1 1420; SSE2-NEXT: por %xmm3, %xmm2 1421; SSE2-NEXT: paddb 16(%rdx), %xmm2 1422; SSE2-NEXT: paddb (%rdx), %xmm1 1423; SSE2-NEXT: movdqa %xmm1, (%rcx) 1424; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 1425; SSE2-NEXT: retq 1426; 1427; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1428; SSE42: # %bb.0: 1429; SSE42-NEXT: movdqa (%rdi), %xmm1 1430; SSE42-NEXT: movdqa 32(%rdi), %xmm2 1431; SSE42-NEXT: movdqa 48(%rdi), %xmm3 1432; SSE42-NEXT: paddb 48(%rsi), %xmm3 1433; SSE42-NEXT: paddb 32(%rsi), %xmm2 1434; SSE42-NEXT: paddb (%rsi), %xmm1 1435; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1436; SSE42-NEXT: movdqa %xmm1, %xmm4 1437; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 1438; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1439; SSE42-NEXT: paddb 16(%rdx), %xmm1 1440; SSE42-NEXT: paddb (%rdx), %xmm4 1441; SSE42-NEXT: movdqa %xmm4, (%rcx) 1442; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 1443; 
SSE42-NEXT: retq 1444; 1445; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1446; AVX: # %bb.0: 1447; AVX-NEXT: vmovdqa (%rdi), %xmm0 1448; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1449; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1450; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1451; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1452; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1453; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] 1454; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 1455; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 1456; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1457; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1458; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1459; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1460; AVX-NEXT: retq 1461; 1462; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1463; AVX2: # %bb.0: 1464; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1465; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1466; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1467; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1468; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1469; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] 1470; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 1471; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1472; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1473; AVX2-NEXT: vzeroupper 1474; AVX2-NEXT: retq 1475; 1476; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1477; AVX512F: # %bb.0: 1478; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1479; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 1480; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1481; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1482; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1483; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1484; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 1485; AVX512F-NEXT: 
vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) 1486; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 1487; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1488; AVX512F-NEXT: vzeroupper 1489; AVX512F-NEXT: retq 1490; 1491; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1492; AVX512DQ: # %bb.0: 1493; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1494; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 1495; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1496; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1497; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1498; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1499; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] 1500; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) 1501; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 1502; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1503; AVX512DQ-NEXT: vzeroupper 1504; AVX512DQ-NEXT: retq 1505; 1506; AVX512BW-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1507; AVX512BW: # %bb.0: 1508; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1509; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1510; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1511; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1512; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001 1513; AVX512BW-NEXT: kmovd %eax, %k1 1514; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 1515; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 1516; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1517; AVX512BW-NEXT: vzeroupper 1518; AVX512BW-NEXT: retq 1519 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1520 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1521 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1522 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 
43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1523 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1524 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1525 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1526 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1527 ret void 1528} 1529 1530define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1531; SSE2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1532; SSE2: # %bb.0: 1533; SSE2-NEXT: movdqa (%rdi), %xmm0 1534; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1535; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1536; SSE2-NEXT: paddb 48(%rsi), %xmm2 1537; SSE2-NEXT: paddb 32(%rsi), %xmm1 1538; SSE2-NEXT: paddb (%rsi), %xmm0 1539; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1540; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 1541; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 1542; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1543; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1544; SSE2-NEXT: movdqa %xmm0, %xmm3 1545; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 1546; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] 1547; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 1548; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1549; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1550; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1551; SSE2-NEXT: paddb 16(%rdx), %xmm0 1552; SSE2-NEXT: paddb (%rdx), %xmm3 1553; SSE2-NEXT: movdqa %xmm3, (%rcx) 1554; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1555; SSE2-NEXT: retq 1556; 1557; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1558; SSE42: # %bb.0: 1559; SSE42-NEXT: movdqa (%rdi), %xmm0 1560; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1561; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1562; SSE42-NEXT: paddb 48(%rsi), %xmm2 1563; SSE42-NEXT: paddb (%rsi), %xmm0 1564; SSE42-NEXT: paddb 32(%rsi), %xmm1 1565; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] 1566; SSE42-NEXT: pshufb %xmm3, %xmm1 1567; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1568; SSE42-NEXT: movdqa %xmm0, %xmm4 1569; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 1570; SSE42-NEXT: pshufb %xmm3, %xmm2 1571; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1572; SSE42-NEXT: paddb 16(%rdx), %xmm0 1573; SSE42-NEXT: paddb (%rdx), %xmm4 1574; SSE42-NEXT: movdqa %xmm4, (%rcx) 1575; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1576; SSE42-NEXT: retq 1577; 1578; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1579; AVX: # %bb.0: 1580; AVX-NEXT: vmovdqa (%rdi), %xmm0 1581; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1582; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1583; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1584; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1585; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1586; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = 
[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] 1587; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1588; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1589; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1590; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1591; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1592; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1593; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1594; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1595; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1596; AVX-NEXT: retq 1597; 1598; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1599; AVX2: # %bb.0: 1600; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1601; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1602; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1603; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1604; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 1605; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1606; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1607; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1608; AVX2-NEXT: vzeroupper 1609; AVX2-NEXT: retq 1610; 1611; AVX512F-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1612; AVX512F: # %bb.0: 1613; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1614; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1615; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1616; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1617; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 1618; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1619; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1620; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1621; AVX512F-NEXT: vzeroupper 1622; AVX512F-NEXT: retq 1623; 1624; AVX512DQ-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1625; AVX512DQ: 
# %bb.0: 1626; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1627; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1628; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1629; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1630; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 1631; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1632; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1633; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1634; AVX512DQ-NEXT: vzeroupper 1635; AVX512DQ-NEXT: retq 1636; 1637; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1638; AVX512BW: # %bb.0: 1639; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1640; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1641; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] 1642; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 1643; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1644; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1645; AVX512BW-NEXT: vzeroupper 1646; AVX512BW-NEXT: retq 1647 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1648 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1649 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1650 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 1651 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31> 1652 %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8> 1653 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1654 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1655 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1656 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1657 ret void 1658} 1659 1660define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1661; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1662; SSE2: # %bb.0: 1663; SSE2-NEXT: movdqa (%rdi), %xmm0 1664; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1665; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1666; SSE2-NEXT: paddb 48(%rsi), %xmm2 1667; SSE2-NEXT: paddb (%rsi), %xmm0 1668; SSE2-NEXT: paddb 32(%rsi), %xmm1 1669; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] 1670; SSE2-NEXT: pand %xmm3, %xmm1 1671; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1672; SSE2-NEXT: pand %xmm3, %xmm2 1673; SSE2-NEXT: pandn %xmm0, %xmm3 1674; SSE2-NEXT: por %xmm3, %xmm1 1675; SSE2-NEXT: por %xmm2, %xmm3 1676; SSE2-NEXT: paddb 16(%rdx), %xmm3 1677; SSE2-NEXT: paddb (%rdx), %xmm1 1678; SSE2-NEXT: movdqa %xmm1, (%rcx) 1679; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 1680; SSE2-NEXT: retq 1681; 1682; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1683; SSE42: # %bb.0: 1684; SSE42-NEXT: movdqa (%rdi), %xmm0 1685; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1686; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1687; SSE42-NEXT: paddb 48(%rsi), %xmm2 1688; SSE42-NEXT: paddb 32(%rsi), %xmm1 1689; SSE42-NEXT: paddb (%rsi), %xmm0 1690; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1691; SSE42-NEXT: pblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 1692; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 1693; SSE42-NEXT: paddb 16(%rdx), %xmm2 1694; SSE42-NEXT: paddb (%rdx), %xmm1 1695; SSE42-NEXT: movdqa %xmm1, (%rcx) 1696; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 1697; SSE42-NEXT: retq 1698; 1699; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1700; AVX: # %bb.0: 1701; AVX-NEXT: vmovdqa (%rdi), %xmm0 1702; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1703; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1704; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1705; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1706; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1707; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1708; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 1709; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 1710; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1711; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1712; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1713; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1714; AVX-NEXT: retq 1715; 1716; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1717; AVX2: # %bb.0: 1718; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1719; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1720; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1721; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1722; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 1723; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] 1724; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1725; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1726; AVX2-NEXT: vzeroupper 1727; AVX2-NEXT: retq 1728; 1729; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1730; AVX512F: # %bb.0: 1731; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1732; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1733; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1734; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1735; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 1736; 
AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] 1737; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1738; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1739; AVX512F-NEXT: vzeroupper 1740; AVX512F-NEXT: retq 1741; 1742; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1743; AVX512DQ: # %bb.0: 1744; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1745; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1746; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1747; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1748; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 1749; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] 1750; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1751; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1752; AVX512DQ-NEXT: vzeroupper 1753; AVX512DQ-NEXT: retq 1754; 1755; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1756; AVX512BW: # %bb.0: 1757; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1758; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1759; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1760; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] 1761; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 1762; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 1763; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1764; AVX512BW-NEXT: vzeroupper 1765; AVX512BW-NEXT: retq 1766 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1767 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1768 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1769 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 1770 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31> 1771 %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x 
i8> 1772 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1773 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1774 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1775 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1776 ret void 1777} 1778 1779define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1780; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1781; SSE2: # %bb.0: 1782; SSE2-NEXT: movdqa (%rdi), %xmm0 1783; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1784; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1785; SSE2-NEXT: paddb 48(%rsi), %xmm2 1786; SSE2-NEXT: paddb 32(%rsi), %xmm1 1787; SSE2-NEXT: paddb (%rsi), %xmm0 1788; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] 1789; SSE2-NEXT: pand %xmm3, %xmm1 1790; SSE2-NEXT: pand %xmm3, %xmm2 1791; SSE2-NEXT: pandn %xmm0, %xmm3 1792; SSE2-NEXT: por %xmm3, %xmm1 1793; SSE2-NEXT: por %xmm3, %xmm2 1794; SSE2-NEXT: paddb 16(%rdx), %xmm2 1795; SSE2-NEXT: paddb (%rdx), %xmm1 1796; SSE2-NEXT: movdqa %xmm1, (%rcx) 1797; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 1798; SSE2-NEXT: retq 1799; 1800; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1801; SSE42: # %bb.0: 1802; SSE42-NEXT: movdqa (%rdi), 
%xmm0 1803; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1804; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1805; SSE42-NEXT: paddb 48(%rsi), %xmm2 1806; SSE42-NEXT: paddb (%rsi), %xmm0 1807; SSE42-NEXT: paddb 32(%rsi), %xmm1 1808; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1809; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 1810; SSE42-NEXT: paddb 16(%rdx), %xmm0 1811; SSE42-NEXT: paddb (%rdx), %xmm1 1812; SSE42-NEXT: movdqa %xmm1, (%rcx) 1813; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1814; SSE42-NEXT: retq 1815; 1816; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1817; AVX: # %bb.0: 1818; AVX-NEXT: vmovdqa (%rdi), %xmm0 1819; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1820; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1821; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1822; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1823; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1824; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1825; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 1826; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1827; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1828; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1829; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1830; AVX-NEXT: retq 1831; 1832; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1833; AVX2: # %bb.0: 1834; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1835; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1836; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1837; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1838; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1839; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 1840; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1841; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1842; AVX2-NEXT: vzeroupper 1843; AVX2-NEXT: retq 1844; 1845; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1846; AVX512F: # %bb.0: 1847; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1848; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 1849; 
AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1850; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1851; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1852; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 1853; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1854; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1855; AVX512F-NEXT: vzeroupper 1856; AVX512F-NEXT: retq 1857; 1858; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1859; AVX512DQ: # %bb.0: 1860; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1861; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 1862; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1863; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1864; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1865; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 1866; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1867; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1868; AVX512DQ-NEXT: vzeroupper 1869; AVX512DQ-NEXT: retq 1870; 1871; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1872; AVX512BW: # %bb.0: 1873; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1874; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1875; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1876; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] 1877; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 1878; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 1879; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1880; AVX512BW-NEXT: vzeroupper 1881; AVX512BW-NEXT: retq 1882 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1883 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1884 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1885 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 1886 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 
25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1887 %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8> 1888 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1889 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1890 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1891 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1892 ret void 1893} 1894 1895define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1896; SSE2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1897; SSE2: # %bb.0: 1898; SSE2-NEXT: movdqa (%rdi), %xmm0 1899; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1900; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1901; SSE2-NEXT: paddb 48(%rsi), %xmm2 1902; SSE2-NEXT: paddb (%rsi), %xmm0 1903; SSE2-NEXT: paddb 32(%rsi), %xmm1 1904; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1905; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 1906; SSE2-NEXT: movdqa %xmm0, %xmm3 1907; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1908; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 1909; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1910; SSE2-NEXT: paddb 16(%rdx), %xmm0 1911; SSE2-NEXT: paddb (%rdx), %xmm3 1912; SSE2-NEXT: movdqa %xmm3, 
(%rcx) 1913; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1914; SSE2-NEXT: retq 1915; 1916; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1917; SSE42: # %bb.0: 1918; SSE42-NEXT: movdqa (%rdi), %xmm0 1919; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1920; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1921; SSE42-NEXT: paddb 48(%rsi), %xmm2 1922; SSE42-NEXT: paddb 32(%rsi), %xmm1 1923; SSE42-NEXT: paddb (%rsi), %xmm0 1924; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1925; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1926; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1927; SSE42-NEXT: paddb 16(%rdx), %xmm2 1928; SSE42-NEXT: paddb (%rdx), %xmm1 1929; SSE42-NEXT: movdqa %xmm1, (%rcx) 1930; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 1931; SSE42-NEXT: retq 1932; 1933; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1934; AVX: # %bb.0: 1935; AVX-NEXT: vmovdqa (%rdi), %xmm0 1936; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1937; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1938; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1939; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1940; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1941; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1942; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1943; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] 1944; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 1945; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1946; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 1947; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1948; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1949; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 1950; AVX-NEXT: vzeroupper 1951; AVX-NEXT: retq 1952; 1953; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1954; AVX2: # %bb.0: 1955; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1956; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1957; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1958; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1959; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 
1960; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 1961; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1962; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1963; AVX2-NEXT: vzeroupper 1964; AVX2-NEXT: retq 1965; 1966; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1967; AVX512F-SLOW: # %bb.0: 1968; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 1969; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1970; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 1971; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1972; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 1973; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 1974; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1975; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 1976; AVX512F-SLOW-NEXT: vzeroupper 1977; AVX512F-SLOW-NEXT: retq 1978; 1979; AVX512F-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1980; AVX512F-FAST: # %bb.0: 1981; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 1982; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1983; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 1984; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1985; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] 1986; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 1987; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 1988; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 1989; AVX512F-FAST-NEXT: vzeroupper 1990; AVX512F-FAST-NEXT: retq 1991; 1992; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1993; AVX512DQ-SLOW: # %bb.0: 1994; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 1995; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1996; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 1997; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1998; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 1999; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 2000; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2001; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2002; AVX512DQ-SLOW-NEXT: vzeroupper 2003; AVX512DQ-SLOW-NEXT: retq 2004; 2005; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 2006; AVX512DQ-FAST: # %bb.0: 2007; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 2008; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2009; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 2010; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2011; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] 2012; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 2013; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2014; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2015; AVX512DQ-FAST-NEXT: vzeroupper 2016; AVX512DQ-FAST-NEXT: retq 2017; 2018; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 2019; AVX512BW-SLOW: # %bb.0: 2020; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 2021; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2022; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2023; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 2024; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 2025; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2026; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 2027; AVX512BW-SLOW-NEXT: vzeroupper 2028; AVX512BW-SLOW-NEXT: retq 2029; 2030; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 2031; AVX512BW-FAST: # %bb.0: 2032; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 2033; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] 2034; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2035; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 2036; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2037; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 2038; AVX512BW-FAST-NEXT: vzeroupper 2039; 
AVX512BW-FAST-NEXT: retq 2040 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2041 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2042 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2043 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 2044 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15> 2045 %out.bytevec = bitcast <8 x i32> %broadcast.of.aextinreg to <32 x i8> 2046 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2047 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2048 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2049 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2050 ret void 2051} 2052 2053define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2054; SSE2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2055; SSE2: # %bb.0: 2056; SSE2-NEXT: movdqa (%rdi), %xmm0 2057; SSE2-NEXT: movdqa 32(%rdi), %xmm1 2058; SSE2-NEXT: movdqa 48(%rdi), %xmm2 2059; SSE2-NEXT: paddb 48(%rsi), %xmm2 2060; SSE2-NEXT: paddb (%rsi), %xmm0 2061; SSE2-NEXT: paddb 32(%rsi), %xmm1 2062; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2063; 
SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 2064; SSE2-NEXT: paddb 16(%rdx), %xmm2 2065; SSE2-NEXT: paddb (%rdx), %xmm1 2066; SSE2-NEXT: movdqa %xmm1, (%rcx) 2067; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 2068; SSE2-NEXT: retq 2069; 2070; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2071; SSE42: # %bb.0: 2072; SSE42-NEXT: movdqa (%rdi), %xmm0 2073; SSE42-NEXT: movdqa 32(%rdi), %xmm1 2074; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2075; SSE42-NEXT: paddb 48(%rsi), %xmm2 2076; SSE42-NEXT: paddb (%rsi), %xmm0 2077; SSE42-NEXT: paddb 32(%rsi), %xmm1 2078; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2079; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 2080; SSE42-NEXT: paddb 16(%rdx), %xmm0 2081; SSE42-NEXT: paddb (%rdx), %xmm1 2082; SSE42-NEXT: movdqa %xmm1, (%rcx) 2083; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2084; SSE42-NEXT: retq 2085; 2086; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2087; AVX: # %bb.0: 2088; AVX-NEXT: vmovdqa (%rdi), %xmm0 2089; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 2090; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 2091; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 2092; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 2093; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2094; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2095; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2096; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2097; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 2098; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 2099; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2100; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2101; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 2102; AVX-NEXT: vzeroupper 2103; AVX-NEXT: retq 2104; 2105; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2106; AVX2: # %bb.0: 2107; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2108; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2109; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2110; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2111; AVX2-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2112; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2113; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2114; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2115; AVX2-NEXT: vzeroupper 2116; AVX2-NEXT: retq 2117; 2118; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2119; AVX512F-SLOW: # %bb.0: 2120; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 2121; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 2122; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2123; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2124; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2125; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2126; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2127; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2128; AVX512F-SLOW-NEXT: vzeroupper 2129; AVX512F-SLOW-NEXT: retq 2130; 2131; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2132; AVX512F-FAST: # %bb.0: 2133; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 2134; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2135; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2136; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2137; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] 2138; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 2139; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2140; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2141; AVX512F-FAST-NEXT: vzeroupper 2142; AVX512F-FAST-NEXT: retq 2143; 2144; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2145; AVX512DQ-SLOW: # %bb.0: 2146; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 2147; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 2148; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2149; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2150; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2151; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2152; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2153; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2154; AVX512DQ-SLOW-NEXT: vzeroupper 2155; AVX512DQ-SLOW-NEXT: retq 2156; 2157; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2158; AVX512DQ-FAST: # %bb.0: 2159; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 2160; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2161; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2162; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2163; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] 2164; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 2165; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2166; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2167; AVX512DQ-FAST-NEXT: vzeroupper 2168; AVX512DQ-FAST-NEXT: retq 2169; 2170; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2171; AVX512BW-SLOW: # %bb.0: 2172; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 2173; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2174; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2175; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2176; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2177; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2178; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 2179; AVX512BW-SLOW-NEXT: vzeroupper 2180; AVX512BW-SLOW-NEXT: retq 2181; 2182; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2183; AVX512BW-FAST: # %bb.0: 2184; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 2185; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] 2186; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2187; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 2188; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2189; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 2190; AVX512BW-FAST-NEXT: vzeroupper 2191; AVX512BW-FAST-NEXT: retq 2192 %in.vec.base = 
load <64 x i8>, ptr %in.vec.base.ptr, align 64 2193 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2194 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2195 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 2196 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15> 2197 %out.bytevec = bitcast <8 x i32> %broadcast.of.aextinreg to <32 x i8> 2198 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2199 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2200 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2201 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2202 ret void 2203} 2204 2205define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2206; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2207; SSE2: # %bb.0: 2208; SSE2-NEXT: movdqa (%rdi), %xmm0 2209; SSE2-NEXT: movdqa 32(%rdi), %xmm1 2210; SSE2-NEXT: movdqa 48(%rdi), %xmm2 2211; SSE2-NEXT: paddb 48(%rsi), %xmm2 2212; SSE2-NEXT: paddb (%rsi), %xmm0 2213; SSE2-NEXT: paddb 32(%rsi), %xmm1 2214; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2215; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] 
2216; SSE2-NEXT: paddb 16(%rdx), %xmm0 2217; SSE2-NEXT: paddb (%rdx), %xmm1 2218; SSE2-NEXT: movdqa %xmm1, (%rcx) 2219; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2220; SSE2-NEXT: retq 2221; 2222; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2223; SSE42: # %bb.0: 2224; SSE42-NEXT: movdqa (%rdi), %xmm0 2225; SSE42-NEXT: movdqa 32(%rdi), %xmm1 2226; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2227; SSE42-NEXT: paddb 48(%rsi), %xmm2 2228; SSE42-NEXT: paddb 32(%rsi), %xmm1 2229; SSE42-NEXT: paddb (%rsi), %xmm0 2230; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2231; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 2232; SSE42-NEXT: paddb 16(%rdx), %xmm0 2233; SSE42-NEXT: paddb (%rdx), %xmm1 2234; SSE42-NEXT: movdqa %xmm1, (%rcx) 2235; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2236; SSE42-NEXT: retq 2237; 2238; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2239; AVX: # %bb.0: 2240; AVX-NEXT: vmovdqa (%rdi), %xmm0 2241; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 2242; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 2243; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 2244; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 2245; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2246; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2247; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2248; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 2249; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 2250; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 2251; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2252; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2253; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 2254; AVX-NEXT: vzeroupper 2255; AVX-NEXT: retq 2256; 2257; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2258; AVX2: # %bb.0: 2259; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 2260; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2261; AVX2-NEXT: vmovdqa (%rdi), %xmm1 2262; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2263; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 2264; AVX2-NEXT: 
vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 2265; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2266; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2267; AVX2-NEXT: vzeroupper 2268; AVX2-NEXT: retq 2269; 2270; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2271; AVX512F-SLOW: # %bb.0: 2272; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 2273; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2274; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 2275; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2276; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 2277; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 2278; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2279; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2280; AVX512F-SLOW-NEXT: vzeroupper 2281; AVX512F-SLOW-NEXT: retq 2282; 2283; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2284; AVX512F-FAST: # %bb.0: 2285; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 2286; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2287; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2288; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2289; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] 2290; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 2291; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2292; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2293; AVX512F-FAST-NEXT: vzeroupper 2294; AVX512F-FAST-NEXT: retq 2295; 2296; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2297; AVX512DQ-SLOW: # %bb.0: 2298; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 2299; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2300; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 2301; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2302; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 2303; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 2304; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2305; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 
(%rcx) 2306; AVX512DQ-SLOW-NEXT: vzeroupper 2307; AVX512DQ-SLOW-NEXT: retq 2308; 2309; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2310; AVX512DQ-FAST: # %bb.0: 2311; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 2312; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2313; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2314; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2315; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] 2316; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 2317; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2318; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2319; AVX512DQ-FAST-NEXT: vzeroupper 2320; AVX512DQ-FAST-NEXT: retq 2321; 2322; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2323; AVX512BW-SLOW: # %bb.0: 2324; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 2325; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2326; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2327; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 2328; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 2329; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2330; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 2331; AVX512BW-SLOW-NEXT: vzeroupper 2332; AVX512BW-SLOW-NEXT: retq 2333; 2334; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2335; AVX512BW-FAST: # %bb.0: 2336; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 2337; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] 2338; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2339; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 2340; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2341; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 2342; AVX512BW-FAST-NEXT: vzeroupper 2343; AVX512BW-FAST-NEXT: retq 2344 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2345 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2346 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 
2347 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 2348 %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 2349 %out.bytevec = bitcast <4 x i64> %broadcast.of.aextinreg to <32 x i8> 2350 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2351 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2352 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2353 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2354 ret void 2355} 2356 2357define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2358; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2359; SSE2: # %bb.0: 2360; SSE2-NEXT: movdqa (%rdi), %xmm0 2361; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2362; SSE2-NEXT: paddb (%rsi), %xmm0 2363; SSE2-NEXT: paddb 48(%rsi), %xmm1 2364; SSE2-NEXT: psrlw $8, %xmm1 2365; SSE2-NEXT: packuswb %xmm1, %xmm1 2366; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 2367; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2368; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2369; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2370; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] 2371; SSE2-NEXT: paddb (%rdx), %xmm0 2372; SSE2-NEXT: movdqa 16(%rdx), %xmm2 2373; SSE2-NEXT: paddb %xmm1, %xmm2 2374; SSE2-NEXT: paddb 32(%rdx), %xmm1 2375; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 2376; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 2377; SSE2-NEXT: movdqa %xmm0, (%rcx) 2378; SSE2-NEXT: retq 2379; 2380; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2381; SSE42: # %bb.0: 2382; SSE42-NEXT: movdqa (%rdi), %xmm0 2383; SSE42-NEXT: movdqa 48(%rdi), %xmm1 2384; SSE42-NEXT: paddb (%rsi), %xmm0 2385; SSE42-NEXT: paddb 48(%rsi), %xmm1 2386; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2387; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 2388; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2389; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2390; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2391; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] 2392; SSE42-NEXT: paddb (%rdx), %xmm0 2393; SSE42-NEXT: movdqa 16(%rdx), %xmm2 2394; SSE42-NEXT: paddb %xmm1, %xmm2 2395; SSE42-NEXT: paddb 32(%rdx), %xmm1 2396; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2397; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 2398; SSE42-NEXT: movdqa %xmm0, (%rcx) 2399; SSE42-NEXT: retq 2400; 2401; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2402; AVX: # %bb.0: 2403; AVX-NEXT: vmovdqa (%rdi), %xmm0 2404; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2405; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2406; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2407; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2408; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2409; 
AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 2410; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 2411; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2412; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2413; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2414; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 2415; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2416; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2417; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 2418; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2419; AVX-NEXT: retq 2420; 2421; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2422; AVX2: # %bb.0: 2423; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2424; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 2425; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2426; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2427; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2428; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 2429; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2430; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2431; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2432; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2433; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2434; AVX2-NEXT: vzeroupper 2435; AVX2-NEXT: retq 2436; 2437; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2438; AVX512F: # %bb.0: 2439; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2440; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2441; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2442; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2443; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2444; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 2445; 
AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2446; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2447; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2448; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2449; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2450; AVX512F-NEXT: vzeroupper 2451; AVX512F-NEXT: retq 2452; 2453; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2454; AVX512DQ: # %bb.0: 2455; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2456; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2457; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2458; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2459; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2460; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 2461; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2462; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2463; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2464; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2465; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2466; AVX512DQ-NEXT: vzeroupper 2467; AVX512DQ-NEXT: retq 2468; 2469; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2470; AVX512BW: # %bb.0: 2471; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2472; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2473; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2474; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2475; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 2476; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2477; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2478; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2479; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2480; AVX512BW-NEXT: vzeroupper 2481; AVX512BW-NEXT: retq 2482 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2483 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2484 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2485 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63, i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95> 2486 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2487 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2488 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2489 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2490 ret void 2491} 2492 
; Test case vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16.
; %in.vec is the bytewise sum of the two <64 x i8> input loads. The first
; shufflevector builds 48 lanes. Every third lane (0, 3, 6, ...) is element 0
; of %in.vec. Each remaining lane i uses mask index 48+i, so for i < 16 it
; reads %in.vec[48+i] and for i >= 16 the index (64..95) selects from the
; poison second operand. The second shufflevector pads the result to 64 lanes
; with undef mask elements, then the output bias is added and the vector is
; stored. The per-target assembly expectations that follow are autogenerated
; by utils/update_llc_test_checks.py. Regenerate them instead of hand-editing.
2493define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2494; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2495; SSE2: # %bb.0: 2496; SSE2-NEXT: movdqa (%rdi), %xmm0 2497; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2498; SSE2-NEXT: paddb (%rsi), %xmm0 2499; SSE2-NEXT: paddb 48(%rsi), %xmm1 2500; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 2501; SSE2-NEXT: pand %xmm2, %xmm1 2502; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2503; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2504; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2505; SSE2-NEXT: pandn %xmm0, %xmm2 2506; SSE2-NEXT: por %xmm1, %xmm2 2507; SSE2-NEXT: paddb (%rdx), %xmm2 2508; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2509; SSE2-NEXT: paddb %xmm0, %xmm1 2510; SSE2-NEXT: paddb 32(%rdx), %xmm0 2511; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2512; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2513; SSE2-NEXT: movdqa %xmm2, (%rcx) 2514; SSE2-NEXT: retq 2515; 2516; SSE42-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2517; SSE42: # %bb.0: 2518; SSE42-NEXT: movdqa (%rdi), %xmm0 2519; SSE42-NEXT: movdqa 48(%rdi), %xmm1 2520; SSE42-NEXT: paddb 48(%rsi), %xmm1 2521; SSE42-NEXT: paddb (%rsi), %xmm0 2522; SSE42-NEXT: movdqa %xmm0, %xmm2 2523; SSE42-NEXT: palignr {{.*#+}} xmm2 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 2524; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2525; SSE42-NEXT: pxor %xmm1, %xmm1 2526; SSE42-NEXT: pshufb %xmm1, %xmm0 2527; SSE42-NEXT: paddb (%rdx), %xmm2 2528; SSE42-NEXT: movdqa 16(%rdx), %xmm1 2529; SSE42-NEXT: paddb %xmm0, %xmm1 2530; SSE42-NEXT: paddb 32(%rdx), %xmm0 2531; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 2532; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 2533; SSE42-NEXT: movdqa %xmm2, (%rcx) 2534; SSE42-NEXT: retq 2535; 2536; AVX-LABEL: 
vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2537; AVX: # %bb.0: 2538; AVX-NEXT: vmovdqa (%rdi), %xmm0 2539; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2540; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2541; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2542; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2543; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2544; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 2545; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2546; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2547; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 2548; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2549; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2550; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 2551; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2552; AVX-NEXT: retq 2553; 2554; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2555; AVX2: # %bb.0: 2556; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2557; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 2558; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2559; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2560; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2561; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2562; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2563; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2564; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2565; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2566; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2567; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2568; AVX2-NEXT: vzeroupper 2569; AVX2-NEXT: retq 2570; 2571; AVX512F-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2572; AVX512F: # %bb.0: 2573; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2574; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2575; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2576; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2577; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2578; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2579; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 2580; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2581; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2582; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2583; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2584; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2585; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2586; AVX512F-NEXT: vzeroupper 2587; AVX512F-NEXT: retq 2588; 2589; AVX512DQ-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2590; AVX512DQ: # %bb.0: 2591; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2592; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2593; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2594; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2595; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2596; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2597; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 2598; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2599; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2600; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2601; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2602; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2603; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2604; AVX512DQ-NEXT: vzeroupper 2605; AVX512DQ-NEXT: retq 2606; 2607; AVX512BW-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2608; AVX512BW: # %bb.0: 2609; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2610; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2611; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2612; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2613; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2614; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0 2615; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2616; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 2617; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, 
%zmm0 2618; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2619; AVX512BW-NEXT: vzeroupper 2620; AVX512BW-NEXT: retq 2621 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2622 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2623 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2624 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 0, i32 52, i32 53, i32 0, i32 55, i32 56, i32 0, i32 58, i32 59, i32 0, i32 61, i32 62, i32 0, i32 64, i32 65, i32 0, i32 67, i32 68, i32 0, i32 70, i32 71, i32 0, i32 73, i32 74, i32 0, i32 76, i32 77, i32 0, i32 79, i32 80, i32 0, i32 82, i32 83, i32 0, i32 85, i32 86, i32 0, i32 88, i32 89, i32 0, i32 91, i32 92, i32 0, i32 94, i32 95> 2625 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2626 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2627 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2628 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2629 ret void 2630} 2631 2632define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2633; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2634; SSE2: # %bb.0: 2635; SSE2-NEXT: movdqa (%rdi), %xmm0 2636; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2637; 
SSE2-NEXT: paddb (%rsi), %xmm0 2638; SSE2-NEXT: paddb 48(%rsi), %xmm1 2639; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2640; SSE2-NEXT: pand %xmm2, %xmm1 2641; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2642; SSE2-NEXT: pandn %xmm0, %xmm2 2643; SSE2-NEXT: por %xmm1, %xmm2 2644; SSE2-NEXT: paddb (%rdx), %xmm2 2645; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2646; SSE2-NEXT: paddb %xmm0, %xmm1 2647; SSE2-NEXT: paddb 32(%rdx), %xmm0 2648; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2649; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2650; SSE2-NEXT: movdqa %xmm2, (%rcx) 2651; SSE2-NEXT: retq 2652; 2653; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2654; SSE42: # %bb.0: 2655; SSE42-NEXT: movdqa (%rdi), %xmm0 2656; SSE42-NEXT: movdqa 48(%rdi), %xmm1 2657; SSE42-NEXT: paddb 48(%rsi), %xmm1 2658; SSE42-NEXT: paddb (%rsi), %xmm0 2659; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 2660; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2661; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 2662; SSE42-NEXT: paddb (%rdx), %xmm0 2663; SSE42-NEXT: movdqa 16(%rdx), %xmm1 2664; SSE42-NEXT: paddb %xmm2, %xmm1 2665; SSE42-NEXT: paddb 32(%rdx), %xmm2 2666; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 2667; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 2668; SSE42-NEXT: movdqa %xmm0, (%rcx) 2669; SSE42-NEXT: retq 2670; 2671; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2672; AVX: # %bb.0: 2673; AVX-NEXT: vmovdqa (%rdi), %xmm0 2674; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2675; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2676; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2677; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2678; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 2679; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2680; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2681; AVX-NEXT: vpaddb 32(%rdx), 
%xmm0, %xmm2 2682; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2683; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2684; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 2685; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2686; AVX-NEXT: retq 2687; 2688; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2689; AVX2: # %bb.0: 2690; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2691; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 2692; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2693; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2694; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 2695; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2696; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 2697; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2698; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2699; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2700; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2701; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2702; AVX2-NEXT: vzeroupper 2703; AVX2-NEXT: retq 2704; 2705; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2706; AVX512F: # %bb.0: 2707; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2708; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2709; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2710; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2711; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2 2712; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) 2713; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 2714; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 2715; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2716; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2717; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2718; AVX512F-NEXT: vzeroupper 2719; AVX512F-NEXT: retq 2720; 2721; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2722; AVX512DQ: # %bb.0: 2723; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2724; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2725; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2726; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, 
%xmm0 2727; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2 2728; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) 2729; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 2730; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 2731; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2732; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2733; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2734; AVX512DQ-NEXT: vzeroupper 2735; AVX512DQ-NEXT: retq 2736; 2737; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2738; AVX512BW: # %bb.0: 2739; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2740; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2741; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2742; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 2743; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 2744; AVX512BW-NEXT: kmovd %eax, %k1 2745; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 2746; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 2747; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2748; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2749; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2750; AVX512BW-NEXT: vzeroupper 2751; AVX512BW-NEXT: retq 2752 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2753 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2754 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2755 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95> 2756 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, 
i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2757 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2758 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2759 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2760 ret void 2761} 2762 
; Test case vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8.
; %in.vec is the bytewise sum of the two <64 x i8> input loads. The first
; shufflevector builds 48 lanes. Every sixth lane (0, 6, 12, ...) is element 0
; of %in.vec. Each remaining lane i uses mask index 48+i, so for i < 16 it
; reads %in.vec[48+i] and for i >= 16 the index (64..95) selects from the
; poison second operand. The second shufflevector pads the result to 64 lanes
; with undef mask elements, then the output bias is added and the vector is
; stored. The per-target assembly expectations that follow are autogenerated
; by utils/update_llc_test_checks.py. Regenerate them instead of hand-editing.
2763define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2764; SSE2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2765; SSE2: # %bb.0: 2766; SSE2-NEXT: movdqa (%rdi), %xmm0 2767; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2768; SSE2-NEXT: paddb (%rsi), %xmm0 2769; SSE2-NEXT: paddb 48(%rsi), %xmm1 2770; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] 2771; SSE2-NEXT: pand %xmm2, %xmm1 2772; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2773; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2774; SSE2-NEXT: pandn %xmm0, %xmm2 2775; SSE2-NEXT: por %xmm1, %xmm2 2776; SSE2-NEXT: paddb (%rdx), %xmm2 2777; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2778; SSE2-NEXT: paddb %xmm0, %xmm1 2779; SSE2-NEXT: paddb 32(%rdx), %xmm0 2780; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2781; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2782; SSE2-NEXT: movdqa %xmm2, (%rcx) 2783; SSE2-NEXT: retq 2784; 2785; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2786; SSE42: # %bb.0: 2787; SSE42-NEXT: movdqa (%rdi), %xmm0 2788; SSE42-NEXT: movdqa 48(%rdi), %xmm1 2789; SSE42-NEXT: paddb 48(%rsi), %xmm1 2790; SSE42-NEXT: paddb (%rsi), %xmm0 2791; SSE42-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 2792; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2793; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2794; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] 2795; SSE42-NEXT: paddb (%rdx), %xmm0 2796; SSE42-NEXT: movdqa 16(%rdx), %xmm2 2797; SSE42-NEXT: paddb %xmm1, %xmm2 2798; SSE42-NEXT: paddb 32(%rdx), %xmm1 2799; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2800; SSE42-NEXT: movdqa %xmm0, (%rcx) 2801; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 2802; SSE42-NEXT: retq 2803; 2804; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2805; AVX: # %bb.0: 2806; AVX-NEXT: vmovdqa (%rdi), %xmm0 2807; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2808; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2809; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2810; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2811; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2812; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2813; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2814; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2815; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 2816; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2817; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2818; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2819; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 2820; AVX-NEXT: retq 2821; 2822; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2823; AVX2: # %bb.0: 2824; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2825; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 2826; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2827; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2828; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2829; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2830; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2831; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2832; AVX2-NEXT: vpaddb (%rdx), %ymm1, 
%ymm1 2833; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2834; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2835; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2836; AVX2-NEXT: vzeroupper 2837; AVX2-NEXT: retq 2838; 2839; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2840; AVX512F: # %bb.0: 2841; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2842; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2843; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2844; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2845; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2846; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2847; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 2848; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2849; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2850; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2851; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2852; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2853; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2854; AVX512F-NEXT: vzeroupper 2855; AVX512F-NEXT: retq 2856; 2857; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2858; AVX512DQ: # %bb.0: 2859; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2860; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2861; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2862; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2863; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2864; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2865; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 2866; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 2867; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2868; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2869; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2870; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2871; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2872; AVX512DQ-NEXT: vzeroupper 2873; AVX512DQ-NEXT: retq 2874; 2875; AVX512BW-LABEL: 
vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2876; AVX512BW: # %bb.0: 2877; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2878; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2879; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2880; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2881; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2882; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 2883; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 2884; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2885; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2886; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2887; AVX512BW-NEXT: vzeroupper 2888; AVX512BW-NEXT: retq 2889 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2890 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2891 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2892 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 0, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 0, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 0, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 0, i32 91, i32 92, i32 93, i32 94, i32 95> 2893 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef> 2894 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2895 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2896 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2897 ret void 2898} 2899 2900define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2901; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2902; SSE2: # %bb.0: 2903; SSE2-NEXT: movdqa (%rdi), %xmm0 2904; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2905; SSE2-NEXT: paddb (%rsi), %xmm0 2906; SSE2-NEXT: paddb 48(%rsi), %xmm1 2907; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 2908; SSE2-NEXT: pand %xmm2, %xmm1 2909; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2910; SSE2-NEXT: pandn %xmm0, %xmm2 2911; SSE2-NEXT: por %xmm1, %xmm2 2912; SSE2-NEXT: paddb (%rdx), %xmm2 2913; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2914; SSE2-NEXT: paddb %xmm0, %xmm1 2915; SSE2-NEXT: paddb 32(%rdx), %xmm0 2916; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2917; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2918; SSE2-NEXT: movdqa %xmm2, (%rcx) 2919; SSE2-NEXT: retq 2920; 2921; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2922; SSE42: # %bb.0: 2923; SSE42-NEXT: movdqa (%rdi), %xmm0 2924; SSE42-NEXT: movdqa 48(%rdi), %xmm1 2925; SSE42-NEXT: paddb 48(%rsi), %xmm1 2926; SSE42-NEXT: paddb (%rsi), %xmm0 2927; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 2928; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2929; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 2930; SSE42-NEXT: paddb (%rdx), %xmm0 2931; SSE42-NEXT: movdqa 16(%rdx), %xmm1 2932; SSE42-NEXT: paddb %xmm2, %xmm1 2933; SSE42-NEXT: paddb 32(%rdx), %xmm2 2934; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 2935; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 2936; SSE42-NEXT: 
movdqa %xmm0, (%rcx) 2937; SSE42-NEXT: retq 2938; 2939; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2940; AVX: # %bb.0: 2941; AVX-NEXT: vmovdqa (%rdi), %xmm0 2942; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2943; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2944; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2945; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2946; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 2947; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2948; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2949; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 2950; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2951; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2952; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 2953; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2954; AVX-NEXT: retq 2955; 2956; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2957; AVX2: # %bb.0: 2958; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2959; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 2960; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2961; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2962; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 2963; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 2964; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 2965; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2966; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2967; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2968; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2969; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2970; AVX2-NEXT: vzeroupper 2971; AVX2-NEXT: retq 2972; 2973; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2974; AVX512F: # %bb.0: 2975; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2976; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2977; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2978; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2979; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 2980; AVX512F-NEXT: vpternlogq 
{{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) 2981; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 2982; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 2983; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2984; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2985; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2986; AVX512F-NEXT: vzeroupper 2987; AVX512F-NEXT: retq 2988; 2989; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2990; AVX512DQ: # %bb.0: 2991; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2992; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2993; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2994; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2995; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 2996; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) 2997; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 2998; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 2999; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3000; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3001; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3002; AVX512DQ-NEXT: vzeroupper 3003; AVX512DQ-NEXT: retq 3004; 3005; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 3006; AVX512BW: # %bb.0: 3007; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3008; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3009; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3010; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 3011; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 3012; AVX512BW-NEXT: kmovd %eax, %k1 3013; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 3014; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 3015; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3016; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3017; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3018; AVX512BW-NEXT: vzeroupper 3019; AVX512BW-NEXT: retq 3020 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3021 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3022 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3023 %broadcast.of.aextinreg = shufflevector <64 x 
i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3024 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3025 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3026 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3027 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3028 ret void 3029} 3030 3031define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3032; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3033; SSE2: # %bb.0: 3034; SSE2-NEXT: movdqa (%rdi), %xmm0 3035; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3036; SSE2-NEXT: paddb (%rsi), %xmm0 3037; SSE2-NEXT: paddb 48(%rsi), %xmm1 3038; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 3039; SSE2-NEXT: pand %xmm2, %xmm1 3040; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 3041; SSE2-NEXT: pandn %xmm3, %xmm2 3042; SSE2-NEXT: por %xmm1, %xmm2 3043; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 3044; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3045; SSE2-NEXT: paddb (%rdx), %xmm2 3046; SSE2-NEXT: paddb 16(%rdx), %xmm0 3047; SSE2-NEXT: paddb 32(%rdx), %xmm1 3048; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3049; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3050; SSE2-NEXT: movdqa %xmm2, (%rcx) 3051; SSE2-NEXT: retq 3052; 3053; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3054; SSE42: # %bb.0: 3055; SSE42-NEXT: movdqa (%rdi), %xmm0 3056; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3057; SSE42-NEXT: paddb 48(%rsi), %xmm1 3058; SSE42-NEXT: paddb (%rsi), %xmm0 3059; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 3060; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] 3061; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3062; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3063; SSE42-NEXT: paddb (%rdx), %xmm0 3064; SSE42-NEXT: paddb 16(%rdx), %xmm3 3065; SSE42-NEXT: paddb 32(%rdx), %xmm2 3066; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 3067; SSE42-NEXT: movdqa %xmm3, 16(%rcx) 3068; SSE42-NEXT: movdqa %xmm0, (%rcx) 3069; SSE42-NEXT: retq 3070; 3071; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3072; AVX: # %bb.0: 3073; AVX-NEXT: vmovdqa (%rdi), %xmm0 3074; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3075; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3076; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3077; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3078; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3079; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 3080; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 3081; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3082; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3083; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3084; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3085; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3086; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3087; AVX-NEXT: retq 3088; 
3089; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3090; AVX2: # %bb.0: 3091; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3092; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3093; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3094; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3095; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3096; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3097; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 3098; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3099; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3100; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3101; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3102; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3103; AVX2-NEXT: vzeroupper 3104; AVX2-NEXT: retq 3105; 3106; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3107; AVX512F: # %bb.0: 3108; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3109; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3110; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3111; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3112; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3113; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3114; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 3115; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 3116; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3117; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3118; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3119; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3120; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3121; AVX512F-NEXT: vzeroupper 3122; AVX512F-NEXT: retq 3123; 3124; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3125; AVX512DQ: # %bb.0: 3126; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3127; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3128; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3129; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3130; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3131; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3132; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 3133; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 3134; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3135; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3136; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3137; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3138; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3139; AVX512DQ-NEXT: vzeroupper 3140; AVX512DQ-NEXT: retq 3141; 3142; AVX512BW-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3143; AVX512BW: # %bb.0: 3144; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3145; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3146; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3147; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3148; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3149; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 3150; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3151; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3152; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3153; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3154; AVX512BW-NEXT: vzeroupper 3155; AVX512BW-NEXT: retq 3156 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3157 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3158 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3159 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3160 %out.bytevec.padded = 
shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3161 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3162 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3163 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3164 ret void 3165} 3166
; Test: sums the two <64 x i8> inputs, then shuffles the sum into a <48 x i8>
; vector in which byte 0 is repeated at result lanes 0, 16 and 32 (one per
; 16-byte group); every other lane uses a mask index in the range 49-95, where
; indices >= 64 refer to the second (poison) shuffle operand. The result is
; padded to 64 bytes with 16 undef lanes, biased by %out.vec.bias and stored.
; The per-target CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py: regenerate them, do not edit by hand.
3167define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3168; SSE2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3169; SSE2: # %bb.0: 3170; SSE2-NEXT: movdqa (%rdi), %xmm0 3171; SSE2-NEXT: movdqa 16(%rdi), %xmm1 3172; SSE2-NEXT: movdqa 48(%rdi), %xmm2 3173; SSE2-NEXT: paddb 16(%rsi), %xmm1 3174; SSE2-NEXT: paddb 48(%rsi), %xmm2 3175; SSE2-NEXT: paddb (%rsi), %xmm0 3176; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3177; SSE2-NEXT: pand %xmm3, %xmm2 3178; SSE2-NEXT: pandn %xmm0, %xmm3 3179; SSE2-NEXT: por %xmm2, %xmm3 3180; SSE2-NEXT: paddb (%rdx), %xmm3 3181; SSE2-NEXT: movdqa 16(%rdx), %xmm2 3182; SSE2-NEXT: paddb %xmm0, %xmm2 3183; SSE2-NEXT: paddb 48(%rdx), %xmm1 3184; SSE2-NEXT: paddb 32(%rdx), %xmm0 3185; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3186; SSE2-NEXT: movdqa %xmm1, 48(%rcx) 3187; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 3188; SSE2-NEXT: movdqa %xmm3, (%rcx) 3189; SSE2-NEXT: retq 3190; 3191; SSE42-LABEL: 
vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3192; SSE42: # %bb.0: 3193; SSE42-NEXT: movdqa (%rdi), %xmm1 3194; SSE42-NEXT: movdqa 16(%rdi), %xmm2 3195; SSE42-NEXT: movdqa 48(%rdi), %xmm3 3196; SSE42-NEXT: paddb 16(%rsi), %xmm2 3197; SSE42-NEXT: paddb 48(%rsi), %xmm3 3198; SSE42-NEXT: paddb (%rsi), %xmm1 3199; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3200; SSE42-NEXT: movdqa %xmm1, %xmm4 3201; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 3202; SSE42-NEXT: paddb (%rdx), %xmm4 3203; SSE42-NEXT: movdqa 16(%rdx), %xmm0 3204; SSE42-NEXT: paddb %xmm1, %xmm0 3205; SSE42-NEXT: paddb 48(%rdx), %xmm2 3206; SSE42-NEXT: paddb 32(%rdx), %xmm1 3207; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 3208; SSE42-NEXT: movdqa %xmm2, 48(%rcx) 3209; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3210; SSE42-NEXT: movdqa %xmm4, (%rcx) 3211; SSE42-NEXT: retq 3212; 3213; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3214; AVX: # %bb.0: 3215; AVX-NEXT: vmovdqa (%rdi), %xmm0 3216; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 3217; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 3218; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 3219; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 3220; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3221; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] 3222; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 3223; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 3224; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 3225; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 3226; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3227; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3228; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 3229; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 3230; AVX-NEXT: vmovdqa %xmm2, (%rcx) 3231; AVX-NEXT: retq 3232; 3233; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3234; AVX2: # %bb.0: 3235; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3236; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3237; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3238; 
AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3239; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 3240; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0] 3241; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 3242; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3243; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3244; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3245; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3246; AVX2-NEXT: vzeroupper 3247; AVX2-NEXT: retq 3248; 3249; AVX512F-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3250; AVX512F: # %bb.0: 3251; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 3252; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3253; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3254; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3255; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 3256; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3257; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] 3258; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2)) 3259; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1 3260; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3261; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3262; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3263; AVX512F-NEXT: vzeroupper 3264; AVX512F-NEXT: retq 3265; 3266; AVX512DQ-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3267; AVX512DQ: # %bb.0: 3268; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 3269; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3270; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3271; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3272; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 3273; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3274; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] 3275; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & 
(ymm1 ^ ymm2)) 3276; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1 3277; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3278; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3279; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3280; AVX512DQ-NEXT: vzeroupper 3281; AVX512DQ-NEXT: retq 3282; 3283; AVX512BW-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3284; AVX512BW: # %bb.0: 3285; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3286; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3287; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3288; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 3289; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001 3290; AVX512BW-NEXT: kmovd %eax, %k1 3291; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 3292; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3293; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3294; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3295; AVX512BW-NEXT: vzeroupper 3296; AVX512BW-NEXT: retq 3297 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3298 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3299 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3300 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3301 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 
38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3302 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3303 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3304 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3305 ret void 3306} 3307
; Test: sums the two <64 x i8> inputs, then shuffles the sum into a <48 x i8>
; vector in which byte 0 is repeated at result lanes 0 and 24 (one per 24-byte
; group); every other lane uses a mask index in the range 49-95, where
; indices >= 64 refer to the second (poison) shuffle operand. The result is
; padded to 64 bytes with 16 undef lanes, biased by %out.vec.bias and stored.
; The per-target CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py: regenerate them, do not edit by hand.
3308define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3309; SSE2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3310; SSE2: # %bb.0: 3311; SSE2-NEXT: movdqa (%rdi), %xmm0 3312; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3313; SSE2-NEXT: paddb 48(%rsi), %xmm1 3314; SSE2-NEXT: paddb (%rsi), %xmm0 3315; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3316; SSE2-NEXT: pand %xmm2, %xmm1 3317; SSE2-NEXT: pandn %xmm0, %xmm2 3318; SSE2-NEXT: por %xmm1, %xmm2 3319; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3320; SSE2-NEXT: paddb (%rdx), %xmm2 3321; SSE2-NEXT: paddb 16(%rdx), %xmm0 3322; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3323; SSE2-NEXT: movdqa %xmm2, (%rcx) 3324; SSE2-NEXT: retq 3325; 3326; SSE42-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3327; SSE42: # %bb.0: 3328; SSE42-NEXT: movdqa (%rdi), %xmm1 3329; SSE42-NEXT: movdqa 48(%rdi), %xmm2 3330; SSE42-NEXT: paddb 48(%rsi), %xmm2 3331; SSE42-NEXT: paddb (%rsi), %xmm1 3332; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3333; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] 3334; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 3335; SSE42-NEXT: paddb (%rdx), %xmm1 3336; SSE42-NEXT: paddb 16(%rdx), %xmm3 3337; SSE42-NEXT: movdqa %xmm1, (%rcx) 3338; SSE42-NEXT: movdqa %xmm3, 16(%rcx) 3339; SSE42-NEXT: retq 3340; 
3341; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3342; AVX: # %bb.0: 3343; AVX-NEXT: vmovdqa (%rdi), %xmm0 3344; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3345; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3346; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3347; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] 3348; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 3349; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3350; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3351; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3352; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3353; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3354; AVX-NEXT: retq 3355; 3356; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3357; AVX2: # %bb.0: 3358; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3359; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3360; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3361; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3362; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] 3363; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 3364; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 3365; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3366; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3367; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3368; AVX2-NEXT: vzeroupper 3369; AVX2-NEXT: retq 3370; 3371; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3372; AVX512F: # %bb.0: 3373; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3374; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3375; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3376; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3377; AVX512F-NEXT: vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0)) 3378; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 3379; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3380; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3381; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3382; AVX512F-NEXT: vzeroupper 3383; AVX512F-NEXT: retq 3384; 3385; AVX512DQ-LABEL: 
vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3386; AVX512DQ: # %bb.0: 3387; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3388; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3389; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3390; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3391; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0)) 3392; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0 3393; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3394; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3395; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 3396; AVX512DQ-NEXT: vzeroupper 3397; AVX512DQ-NEXT: retq 3398; 3399; AVX512BW-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3400; AVX512BW: # %bb.0: 3401; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3402; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3403; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3404; AVX512BW-NEXT: movw $1, %ax 3405; AVX512BW-NEXT: kmovd %eax, %k1 3406; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 3407; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0 3408; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3409; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3410; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3411; AVX512BW-NEXT: vzeroupper 3412; AVX512BW-NEXT: retq 3413 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3414 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3415 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3416 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3417 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> 
poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3418 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3419 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3420 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3421 ret void 3422} 3423
; Test: sums the two <64 x i8> inputs, bitcasts the sum to <32 x i16>, then
; shuffles it into a <24 x i16> vector in which word 0 is repeated at every
; even result lane (0, 2, ..., 22); the odd lanes use mask indices 25, 27, ...,
; 47, where indices >= 32 refer to the second (poison) shuffle operand. The
; result is bitcast to <48 x i8>, padded to 64 bytes with 16 undef lanes,
; biased by %out.vec.bias and stored.
; The per-target CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py: regenerate them, do not edit by hand.
3424define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3425; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3426; SSE2: # %bb.0: 3427; SSE2-NEXT: movdqa (%rdi), %xmm0 3428; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3429; SSE2-NEXT: paddb 48(%rsi), %xmm1 3430; SSE2-NEXT: paddb (%rsi), %xmm0 3431; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 3432; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 3433; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 3434; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3435; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 3436; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3437; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3438; SSE2-NEXT: paddb (%rdx), %xmm2 3439; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3440; SSE2-NEXT: paddb %xmm0, %xmm1 3441; SSE2-NEXT: paddb 32(%rdx), %xmm0 3442; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3443; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3444; SSE2-NEXT: movdqa %xmm2, (%rcx) 3445; SSE2-NEXT: retq 3446; 3447; SSE42-LABEL: 
vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3448; SSE42: # %bb.0: 3449; SSE42-NEXT: movdqa (%rdi), %xmm0 3450; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3451; SSE42-NEXT: paddb (%rsi), %xmm0 3452; SSE42-NEXT: paddb 48(%rsi), %xmm1 3453; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3454; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 3455; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3456; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3457; SSE42-NEXT: paddb (%rdx), %xmm2 3458; SSE42-NEXT: movdqa 16(%rdx), %xmm1 3459; SSE42-NEXT: paddb %xmm0, %xmm1 3460; SSE42-NEXT: paddb 32(%rdx), %xmm0 3461; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 3462; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 3463; SSE42-NEXT: movdqa %xmm2, (%rcx) 3464; SSE42-NEXT: retq 3465; 3466; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3467; AVX: # %bb.0: 3468; AVX-NEXT: vmovdqa (%rdi), %xmm0 3469; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3470; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3471; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3472; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 3473; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3474; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3475; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3476; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 3477; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm1 3478; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3479; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3480; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 3481; AVX-NEXT: vmovdqa %xmm0, (%rcx) 3482; AVX-NEXT: retq 3483; 3484; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3485; AVX2: # %bb.0: 3486; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3487; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3488; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3489; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3490; 
AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 3491; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3492; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3493; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3494; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3495; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3496; AVX2-NEXT: vzeroupper 3497; AVX2-NEXT: retq 3498; 3499; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3500; AVX512F: # %bb.0: 3501; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3502; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3503; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3504; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3505; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 3506; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3507; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3508; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3509; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3510; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3511; AVX512F-NEXT: vzeroupper 3512; AVX512F-NEXT: retq 3513; 3514; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3515; AVX512DQ: # %bb.0: 3516; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3517; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3518; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3519; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3520; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 3521; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3522; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3523; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3524; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3525; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3526; AVX512DQ-NEXT: vzeroupper 3527; AVX512DQ-NEXT: retq 3528; 3529; 
AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3530; AVX512BW: # %bb.0: 3531; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3532; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3533; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31] 3534; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 3535; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 3536; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3537; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3538; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3539; AVX512BW-NEXT: vzeroupper 3540; AVX512BW-NEXT: retq 3541 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3542 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3543 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3544 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 3545 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31, i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47> 3546 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8> 3547 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3548 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3549 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3550 store <64 x i8> %out.vec, ptr 
%out.vec.ptr, align 64 3551 ret void 3552} 3553
; Test: sums the two <64 x i8> inputs, bitcasts the sum to <32 x i16>, then
; shuffles it into a <24 x i16> vector in which word 0 is repeated at every
; third result lane (0, 3, ..., 21); every other lane uses a mask index in the
; range 25-47, where indices >= 32 refer to the second (poison) shuffle
; operand. The result is bitcast to <48 x i8>, padded to 64 bytes with 16
; undef lanes, biased by %out.vec.bias and stored.
; The per-target CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py: regenerate them, do not edit by hand.
3554define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3555; SSE2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3556; SSE2: # %bb.0: 3557; SSE2-NEXT: movdqa (%rdi), %xmm0 3558; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3559; SSE2-NEXT: paddb (%rsi), %xmm0 3560; SSE2-NEXT: paddb 48(%rsi), %xmm1 3561; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] 3562; SSE2-NEXT: pand %xmm2, %xmm1 3563; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3564; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3565; SSE2-NEXT: pandn %xmm0, %xmm2 3566; SSE2-NEXT: por %xmm1, %xmm2 3567; SSE2-NEXT: paddb (%rdx), %xmm2 3568; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3569; SSE2-NEXT: paddb %xmm0, %xmm1 3570; SSE2-NEXT: paddb 32(%rdx), %xmm0 3571; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3572; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3573; SSE2-NEXT: movdqa %xmm2, (%rcx) 3574; SSE2-NEXT: retq 3575; 3576; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3577; SSE42: # %bb.0: 3578; SSE42-NEXT: movdqa (%rdi), %xmm0 3579; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3580; SSE42-NEXT: paddb 48(%rsi), %xmm1 3581; SSE42-NEXT: paddb (%rsi), %xmm0 3582; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3583; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3584; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3585; SSE42-NEXT: paddb (%rdx), %xmm1 3586; SSE42-NEXT: movdqa 16(%rdx), %xmm2 3587; SSE42-NEXT: paddb %xmm0, %xmm2 3588; SSE42-NEXT: paddb 32(%rdx), %xmm0 3589; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 3590; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 3591; SSE42-NEXT: movdqa %xmm1, (%rcx) 3592; SSE42-NEXT: retq 3593; 3594; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3595; AVX: # %bb.0: 3596; AVX-NEXT: vmovdqa (%rdi), %xmm0 3597; AVX-NEXT: 
vmovdqa 48(%rdi), %xmm1 3598; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3599; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3600; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3601; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3602; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3603; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3604; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 3605; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3606; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3607; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3608; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3609; AVX-NEXT: retq 3610; 3611; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3612; AVX2: # %bb.0: 3613; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3614; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3615; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3616; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 3617; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3618; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3619; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3620; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3621; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3622; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3623; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3624; AVX2-NEXT: vzeroupper 3625; AVX2-NEXT: retq 3626; 3627; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3628; AVX512F: # %bb.0: 3629; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3630; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3631; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3632; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 3633; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 3634; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3635; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3636; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3637; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3638; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3639; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3640; 
AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3641; AVX512F-NEXT: vzeroupper 3642; AVX512F-NEXT: retq 3643; 3644; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3645; AVX512DQ: # %bb.0: 3646; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3647; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3648; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3649; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 3650; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 3651; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3652; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3653; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3654; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3655; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3656; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3657; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3658; AVX512DQ-NEXT: vzeroupper 3659; AVX512DQ-NEXT: retq 3660; 3661; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3662; AVX512BW: # %bb.0: 3663; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3664; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3665; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31] 3666; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 3667; AVX512BW-NEXT: vpbroadcastw %xmm0, %xmm0 3668; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3669; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3670; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3671; AVX512BW-NEXT: vzeroupper 3672; AVX512BW-NEXT: retq 3673 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3674 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3675 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3676 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 3677 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 0, i32 28, i32 29, i32 0, i32 31, i32 32, i32 0, i32 34, i32 35, i32 0, i32 37, i32 38, i32 0, i32 40, i32 41, i32 0, i32 43, i32 44, i32 0, 
i32 46, i32 47> 3678 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8> 3679 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3680 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3681 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3682 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3683 ret void 3684} 3685 3686define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3687; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3688; SSE2: # %bb.0: 3689; SSE2-NEXT: movdqa (%rdi), %xmm0 3690; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3691; SSE2-NEXT: paddb (%rsi), %xmm0 3692; SSE2-NEXT: paddb 48(%rsi), %xmm1 3693; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535] 3694; SSE2-NEXT: pand %xmm2, %xmm1 3695; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3696; SSE2-NEXT: pandn %xmm0, %xmm2 3697; SSE2-NEXT: por %xmm1, %xmm2 3698; SSE2-NEXT: paddb (%rdx), %xmm2 3699; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3700; SSE2-NEXT: paddb %xmm0, %xmm1 3701; SSE2-NEXT: paddb 32(%rdx), %xmm0 3702; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3703; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3704; SSE2-NEXT: movdqa %xmm2, (%rcx) 3705; SSE2-NEXT: retq 3706; 3707; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 
3708; SSE42: # %bb.0: 3709; SSE42-NEXT: movdqa (%rdi), %xmm0 3710; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3711; SSE42-NEXT: paddb 48(%rsi), %xmm1 3712; SSE42-NEXT: paddb (%rsi), %xmm0 3713; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3714; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 3715; SSE42-NEXT: paddb (%rdx), %xmm1 3716; SSE42-NEXT: movdqa 16(%rdx), %xmm2 3717; SSE42-NEXT: paddb %xmm0, %xmm2 3718; SSE42-NEXT: paddb 32(%rdx), %xmm0 3719; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 3720; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 3721; SSE42-NEXT: movdqa %xmm1, (%rcx) 3722; SSE42-NEXT: retq 3723; 3724; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3725; AVX: # %bb.0: 3726; AVX-NEXT: vmovdqa (%rdi), %xmm0 3727; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3728; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3729; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3730; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3731; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 3732; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3733; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 3734; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3735; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3736; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3737; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3738; AVX-NEXT: retq 3739; 3740; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3741; AVX2: # %bb.0: 3742; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3743; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3744; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3745; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 3746; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3747; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 3748; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] 3749; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3750; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 3751; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3752; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 3753; AVX2-NEXT: vzeroupper 3754; 
AVX2-NEXT: retq 3755; 3756; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3757; AVX512F: # %bb.0: 3758; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3759; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3760; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3761; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3762; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 3763; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 3764; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 3765; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3766; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3767; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3768; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3769; AVX512F-NEXT: vzeroupper 3770; AVX512F-NEXT: retq 3771; 3772; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3773; AVX512DQ: # %bb.0: 3774; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3775; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3776; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3777; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3778; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 3779; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 3780; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 3781; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3782; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3783; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3784; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3785; AVX512DQ-NEXT: vzeroupper 3786; AVX512DQ-NEXT: retq 3787; 3788; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3789; AVX512BW: # %bb.0: 3790; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3791; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3792; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3793; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15] 3794; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 3795; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 
3796; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 3797; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 3798; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3799; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3800; AVX512BW-NEXT: vzeroupper 3801; AVX512BW-NEXT: retq 3802 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3803 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3804 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3805 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 3806 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47> 3807 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8> 3808 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3809 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3810 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3811 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3812 ret void 3813} 3814 3815define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3816; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3817; SSE2: # 
%bb.0: 3818; SSE2-NEXT: movdqa (%rdi), %xmm0 3819; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3820; SSE2-NEXT: paddb (%rsi), %xmm0 3821; SSE2-NEXT: paddb 48(%rsi), %xmm1 3822; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] 3823; SSE2-NEXT: pand %xmm2, %xmm1 3824; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 3825; SSE2-NEXT: pandn %xmm3, %xmm2 3826; SSE2-NEXT: por %xmm1, %xmm2 3827; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 3828; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3829; SSE2-NEXT: paddb (%rdx), %xmm2 3830; SSE2-NEXT: paddb 16(%rdx), %xmm0 3831; SSE2-NEXT: paddb 32(%rdx), %xmm1 3832; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3833; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3834; SSE2-NEXT: movdqa %xmm2, (%rcx) 3835; SSE2-NEXT: retq 3836; 3837; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3838; SSE42: # %bb.0: 3839; SSE42-NEXT: movdqa (%rdi), %xmm0 3840; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3841; SSE42-NEXT: paddb 48(%rsi), %xmm1 3842; SSE42-NEXT: paddb (%rsi), %xmm0 3843; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 3844; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] 3845; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 3846; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3847; SSE42-NEXT: paddb (%rdx), %xmm2 3848; SSE42-NEXT: paddb 16(%rdx), %xmm0 3849; SSE42-NEXT: paddb 32(%rdx), %xmm1 3850; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 3851; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3852; SSE42-NEXT: movdqa %xmm2, (%rcx) 3853; SSE42-NEXT: retq 3854; 3855; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3856; AVX: # %bb.0: 3857; AVX-NEXT: vmovdqa (%rdi), %xmm0 3858; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3859; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3860; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3861; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 3862; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 3863; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7] 
3864; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3865; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3866; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 3867; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3868; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3869; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3870; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3871; AVX-NEXT: retq 3872; 3873; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3874; AVX2: # %bb.0: 3875; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3876; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3877; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3878; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 3879; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3880; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] 3881; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3882; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3883; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3884; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3885; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3886; AVX2-NEXT: vzeroupper 3887; AVX2-NEXT: retq 3888; 3889; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3890; AVX512F: # %bb.0: 3891; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3892; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3893; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3894; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 3895; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 3896; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3897; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] 3898; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3899; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3900; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3901; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3902; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3903; AVX512F-NEXT: vzeroupper 3904; AVX512F-NEXT: retq 3905; 3906; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3907; AVX512DQ: # %bb.0: 3908; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3909; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3910; 
AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3911; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 3912; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 3913; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3914; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] 3915; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3916; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3917; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3918; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3919; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3920; AVX512DQ-NEXT: vzeroupper 3921; AVX512DQ-NEXT: retq 3922; 3923; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 3924; AVX512BW: # %bb.0: 3925; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3926; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3927; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3928; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0] 3929; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 3930; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 3931; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 3932; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3933; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3934; AVX512BW-NEXT: vzeroupper 3935; AVX512BW-NEXT: retq 3936 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3937 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3938 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3939 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 3940 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 0, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 0, i32 43, i32 44, i32 45, i32 46, i32 47> 3941 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8> 3942 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3943 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3944 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3945 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3946 ret void 3947} 3948 3949define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3950; SSE2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 3951; SSE2: # %bb.0: 3952; SSE2-NEXT: movdqa (%rdi), %xmm0 3953; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3954; SSE2-NEXT: paddb 48(%rsi), %xmm1 3955; SSE2-NEXT: paddb (%rsi), %xmm0 3956; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 3957; SSE2-NEXT: pand %xmm2, %xmm1 3958; SSE2-NEXT: pandn %xmm0, %xmm2 3959; SSE2-NEXT: por %xmm1, %xmm2 3960; SSE2-NEXT: paddb (%rdx), %xmm2 3961; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3962; SSE2-NEXT: paddb %xmm0, %xmm1 3963; SSE2-NEXT: paddb 32(%rdx), %xmm0 3964; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3965; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3966; SSE2-NEXT: movdqa %xmm2, (%rcx) 3967; SSE2-NEXT: retq 3968; 3969; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 3970; SSE42: # %bb.0: 3971; SSE42-NEXT: movdqa (%rdi), %xmm0 3972; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3973; SSE42-NEXT: paddb (%rsi), %xmm0 3974; SSE42-NEXT: paddb 48(%rsi), %xmm1 3975; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 3976; SSE42-NEXT: paddb 
(%rdx), %xmm1 3977; SSE42-NEXT: movdqa 16(%rdx), %xmm2 3978; SSE42-NEXT: paddb %xmm0, %xmm2 3979; SSE42-NEXT: paddb 32(%rdx), %xmm0 3980; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 3981; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 3982; SSE42-NEXT: movdqa %xmm1, (%rcx) 3983; SSE42-NEXT: retq 3984; 3985; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 3986; AVX: # %bb.0: 3987; AVX-NEXT: vmovdqa (%rdi), %xmm0 3988; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 3989; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 3990; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 3991; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3992; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 3993; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] 3994; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 3995; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 3996; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 3997; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3998; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3999; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 4000; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 4001; AVX-NEXT: vmovdqa %xmm2, (%rcx) 4002; AVX-NEXT: retq 4003; 4004; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4005; AVX2: # %bb.0: 4006; AVX2-NEXT: vmovdqa (%rdi), %ymm0 4007; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4008; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 4009; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4010; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 4011; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4012; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4013; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4014; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 4015; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 4016; AVX2-NEXT: vzeroupper 4017; AVX2-NEXT: retq 4018; 4019; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4020; AVX512F: # %bb.0: 4021; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4022; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4023; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 4024; AVX512F-NEXT: vpaddb 
48(%rsi), %xmm1, %xmm1 4025; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 4026; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4027; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4028; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4029; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 4030; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 4031; AVX512F-NEXT: vzeroupper 4032; AVX512F-NEXT: retq 4033; 4034; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4035; AVX512DQ: # %bb.0: 4036; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 4037; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4038; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 4039; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4040; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 4041; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4042; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4043; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4044; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 4045; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 4046; AVX512DQ-NEXT: vzeroupper 4047; AVX512DQ-NEXT: retq 4048; 4049; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4050; AVX512BW: # %bb.0: 4051; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 4052; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4053; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 4054; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15] 4055; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 4056; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 4057; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 4058; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4059; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4060; AVX512BW-NEXT: vzeroupper 4061; AVX512BW-NEXT: retq 4062 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4063 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4064 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 
4065 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 4066 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4067 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8> 4068 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4069 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4070 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4071 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4072 ret void 4073} 4074 4075define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4076; SSE2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4077; SSE2: # %bb.0: 4078; SSE2-NEXT: movdqa (%rdi), %xmm0 4079; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4080; SSE2-NEXT: paddb 48(%rsi), %xmm1 4081; SSE2-NEXT: paddb (%rsi), %xmm0 4082; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 4083; SSE2-NEXT: pand %xmm2, %xmm1 4084; SSE2-NEXT: pandn %xmm0, %xmm2 4085; SSE2-NEXT: por %xmm1, %xmm2 4086; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4087; SSE2-NEXT: paddb (%rdx), 
%xmm2 4088; SSE2-NEXT: paddb 16(%rdx), %xmm0 4089; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4090; SSE2-NEXT: movdqa %xmm2, (%rcx) 4091; SSE2-NEXT: retq 4092; 4093; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4094; SSE42: # %bb.0: 4095; SSE42-NEXT: movdqa (%rdi), %xmm0 4096; SSE42-NEXT: movdqa 48(%rdi), %xmm1 4097; SSE42-NEXT: paddb (%rsi), %xmm0 4098; SSE42-NEXT: paddb 48(%rsi), %xmm1 4099; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4100; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4101; SSE42-NEXT: paddb (%rdx), %xmm1 4102; SSE42-NEXT: paddb 16(%rdx), %xmm0 4103; SSE42-NEXT: movdqa %xmm1, (%rcx) 4104; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4105; SSE42-NEXT: retq 4106; 4107; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4108; AVX: # %bb.0: 4109; AVX-NEXT: vmovdqa (%rdi), %xmm0 4110; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 4111; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4112; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4113; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4114; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4115; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4116; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 4117; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4118; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 4119; AVX-NEXT: retq 4120; 4121; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4122; AVX2: # %bb.0: 4123; AVX2-NEXT: vmovdqa (%rdi), %xmm0 4124; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 4125; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4126; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4127; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4128; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 4129; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 4130; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4131; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 4132; AVX2-NEXT: vzeroupper 4133; AVX2-NEXT: retq 4134; 4135; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4136; AVX512F: # %bb.0: 
4137; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 4138; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 4139; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4140; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4141; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4142; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 4143; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 4144; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4145; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 4146; AVX512F-NEXT: vzeroupper 4147; AVX512F-NEXT: retq 4148; 4149; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4150; AVX512DQ: # %bb.0: 4151; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 4152; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 4153; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4154; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4155; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4156; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 4157; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 4158; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4159; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 4160; AVX512DQ-NEXT: vzeroupper 4161; AVX512DQ-NEXT: retq 4162; 4163; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4164; AVX512BW: # %bb.0: 4165; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 4166; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4167; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 4168; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0] 4169; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 4170; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 4171; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4172; AVX512BW-NEXT: vzeroupper 4173; AVX512BW-NEXT: retq 4174 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4175 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4176 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4177 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 4178 %broadcast.of.aextinreg = shufflevector <32 x i16> 
%in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4179 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8> 4180 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4181 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4182 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4183 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4184 ret void 4185} 4186 4187define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4188; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4189; SSE2: # %bb.0: 4190; SSE2-NEXT: movdqa (%rdi), %xmm0 4191; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4192; SSE2-NEXT: paddb (%rsi), %xmm0 4193; SSE2-NEXT: paddb 48(%rsi), %xmm1 4194; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 4195; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 4196; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4197; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4198; SSE2-NEXT: paddb (%rdx), %xmm2 4199; SSE2-NEXT: movdqa 16(%rdx), %xmm1 4200; SSE2-NEXT: paddb %xmm0, %xmm1 4201; SSE2-NEXT: paddb 32(%rdx), %xmm0 4202; 
SSE2-NEXT: movdqa %xmm0, 32(%rcx) 4203; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 4204; SSE2-NEXT: movdqa %xmm2, (%rcx) 4205; SSE2-NEXT: retq 4206; 4207; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4208; SSE42: # %bb.0: 4209; SSE42-NEXT: movdqa (%rdi), %xmm0 4210; SSE42-NEXT: movdqa 48(%rdi), %xmm1 4211; SSE42-NEXT: paddb 48(%rsi), %xmm1 4212; SSE42-NEXT: paddb (%rsi), %xmm0 4213; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4214; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 4215; SSE42-NEXT: paddb (%rdx), %xmm1 4216; SSE42-NEXT: movdqa 16(%rdx), %xmm2 4217; SSE42-NEXT: paddb %xmm0, %xmm2 4218; SSE42-NEXT: paddb 32(%rdx), %xmm0 4219; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 4220; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 4221; SSE42-NEXT: movdqa %xmm1, (%rcx) 4222; SSE42-NEXT: retq 4223; 4224; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4225; AVX: # %bb.0: 4226; AVX-NEXT: vmovdqa (%rdi), %xmm0 4227; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 4228; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4229; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4230; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 4231; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 4232; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] 4233; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 4234; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 4235; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 4236; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 4237; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 4238; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 4239; AVX-NEXT: vmovdqa %xmm0, (%rcx) 4240; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 4241; AVX-NEXT: vzeroupper 4242; AVX-NEXT: retq 4243; 4244; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4245; AVX2: # %bb.0: 4246; AVX2-NEXT: vmovdqa (%rdi), %xmm0 4247; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 4248; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4249; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2 4250; AVX2-NEXT: vpaddb 
48(%rsi), %xmm1, %xmm1 4251; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 4252; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] 4253; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4254; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 4255; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 4256; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 4257; AVX2-NEXT: vzeroupper 4258; AVX2-NEXT: retq 4259; 4260; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4261; AVX512F: # %bb.0: 4262; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4263; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 4264; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4265; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4266; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4267; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] 4268; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 4269; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 4270; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4271; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4272; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 4273; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 4274; AVX512F-NEXT: vzeroupper 4275; AVX512F-NEXT: retq 4276; 4277; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4278; AVX512DQ: # %bb.0: 4279; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 4280; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 4281; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4282; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4283; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4284; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] 4285; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 4286; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 4287; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 4288; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4289; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 4290; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 4291; AVX512DQ-NEXT: vzeroupper 4292; AVX512DQ-NEXT: retq 4293; 4294; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4295; AVX512BW: # 
%bb.0: 4296; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 4297; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4298; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] 4299; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 4300; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4301; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4302; AVX512BW-NEXT: vzeroupper 4303; AVX512BW-NEXT: retq 4304 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4305 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4306 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4307 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 4308 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 0, i32 15, i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23> 4309 %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8> 4310 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4311 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4312 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4313 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4314 ret void 4315} 4316 4317define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4318; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4319; 
SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: paddb 16(%rdx), %xmm3
; SSE2-NEXT: paddb 32(%rdx), %xmm2
; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm2
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: paddb 32(%rdx), %xmm1
; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  ; The input vector is the byte-wise sum of two 64-byte loads (both align 64).
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  ; Mask indices 0-15 read %in.vec.cast and 16-31 read the poison operand, so
  ; result lanes 0, 3, 6 and 9 hold element 0, lanes 1 and 2 hold elements 13
  ; and 14, and every other lane is poison.
  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 0, i32 16, i32 17, i32 0, i32 19, i32 20, i32 0, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
  ; Widen the 48 result bytes to 64 lanes. The 16 trailing mask elements are
  ; spelled `poison` (the legacy `undef` spelling means the same thing).
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  ; Add the output bias and store all 64 bytes.
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

; Places element 0 of the <16 x i32> input in lanes 0, 4 and 8 of a 12-lane
; result; lanes 1-3 take input elements 13-15 and the remaining lanes are poison.
define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movdqa 16(%rdx), %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: paddb 32(%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb
48(%rsi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm0, %xmm2
; SSE42-NEXT: paddb 32(%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  ; The input vector is the byte-wise sum of two 64-byte loads (both align 64).
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  ; Mask indices 0-15 read %in.vec.cast and 16-31 read the poison operand, so
  ; result lanes 0, 4 and 8 hold element 0, lanes 1-3 hold elements 13-15, and
  ; every other lane is poison.
  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
  ; Widen the 48 result bytes to 64 lanes. The 16 trailing mask elements are
  ; spelled `poison` (the legacy `undef` spelling means the same thing).
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  ; Add the output bias and store all 64 bytes.
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

; Places element 0 of the <16 x i32> input in lanes 0 and 6 of a 12-lane result;
; lanes 1-3 take input elements 13-15 and the remaining lanes are poison.
define void
@vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: paddb 16(%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  ; The input vector is the byte-wise sum of two 64-byte loads (both align 64).
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  ; Mask indices 0-15 read %in.vec.cast and 16-31 read the poison operand, so
  ; result lanes 0 and 6 hold element 0, lanes 1-3 hold elements 13-15, and
  ; every other lane is poison.
  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 0, i32 19, i32 20, i32 21, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
  ; Widen the 48 result bytes to 64 lanes. The 16 trailing mask elements are
  ; spelled `poison` (the legacy `undef` spelling means the same thing).
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  ; Add the output bias and store all 64 bytes.
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

; Places element 0 of the <8 x i64> input in lanes 0, 2 and 4 of a 6-lane
; result; lane 1 takes input element 7 and lanes 3 and 5 are poison.
define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movdqa 16(%rdx), %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: paddb 32(%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm0, %xmm2
; SSE42-NEXT: paddb 32(%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  ; The input vector is the byte-wise sum of two 64-byte loads (both align 64).
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  ; Mask indices 0-7 read %in.vec.cast and 8-15 read the poison operand, so
  ; result lanes 0, 2 and 4 hold element 0, lane 1 holds element 7, and lanes
  ; 3 and 5 are poison.
  %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <6 x i32> <i32 0, i32 7, i32 0, i32 9, i32 0, i32 11>
  %out.bytevec = bitcast <6 x i64> %broadcast.of.aextinreg to <48 x i8>
  ; Widen the 48 result bytes to 64 lanes. The 16 trailing mask elements are
  ; spelled `poison` (the legacy `undef` spelling means the same thing).
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  ; Add the output bias and store all 64 bytes.
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

; Places element 0 of the <8 x i64> input in lanes 0 and 3 of a 6-lane result;
; lane 1 takes input element 7 and lanes 2, 4 and 5 are poison.
define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: paddb 16(%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL:
vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
; AVX512BW-SLOW-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  ; The input vector is the byte-wise sum of two 64-byte loads (both align 64).
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  ; Mask indices 0-7 read %in.vec.cast and 8-15 read the poison operand, so
  ; result lanes 0 and 3 hold element 0, lane 1 holds element 7, and lanes 2,
  ; 4 and 5 are poison.
  %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <6 x i32> <i32 0, i32 7, i32 8, i32 0, i32 10, i32 11>
  %out.bytevec = bitcast <6 x i64> %broadcast.of.aextinreg to <48 x i8>
  ; Widen the 48 result bytes to 64 lanes. The 16 trailing mask elements are
  ; spelled `poison` (the legacy `undef` spelling means the same thing).
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  ; Add the output bias and store all 64 bytes.
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: movdqa 16(%rdx), %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movdqa (%rdx), %xmm2
; SSE-NEXT: paddb %xmm0, %xmm2
4990; SSE-NEXT: movdqa 48(%rdx), %xmm3 4991; SSE-NEXT: paddb %xmm0, %xmm3 4992; SSE-NEXT: paddb 32(%rdx), %xmm0 4993; SSE-NEXT: movdqa %xmm0, 32(%rcx) 4994; SSE-NEXT: movdqa %xmm3, 48(%rcx) 4995; SSE-NEXT: movdqa %xmm2, (%rcx) 4996; SSE-NEXT: movdqa %xmm1, 16(%rcx) 4997; SSE-NEXT: retq 4998; 4999; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5000; AVX: # %bb.0: 5001; AVX-NEXT: vmovdqa (%rdi), %xmm0 5002; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5003; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5004; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5005; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5006; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5007; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5008; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5009; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5010; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5011; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5012; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5013; AVX-NEXT: retq 5014; 5015; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5016; AVX2: # %bb.0: 5017; AVX2-NEXT: vmovdqa (%rdi), %xmm0 5018; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5019; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 5020; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5021; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5022; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5023; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5024; AVX2-NEXT: vzeroupper 5025; AVX2-NEXT: retq 5026; 5027; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5028; AVX512F: # %bb.0: 5029; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 5030; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5031; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 5032; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5033; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5034; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5035; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5036; AVX512F-NEXT: vzeroupper 5037; AVX512F-NEXT: retq 5038; 5039; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5040; AVX512DQ: 
# %bb.0: 5041; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 5042; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5043; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 5044; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5045; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5046; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5047; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5048; AVX512DQ-NEXT: vzeroupper 5049; AVX512DQ-NEXT: retq 5050; 5051; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5052; AVX512BW: # %bb.0: 5053; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5054; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5055; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 5056; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5057; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5058; AVX512BW-NEXT: vzeroupper 5059; AVX512BW-NEXT: retq 5060 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5061 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5062 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5063 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95, i32 0, i32 97, i32 0, i32 99, i32 0, i32 101, i32 0, i32 103, i32 0, i32 105, i32 0, i32 107, i32 0, i32 109, i32 0, i32 111, i32 0, i32 113, i32 0, i32 115, i32 0, i32 117, i32 0, i32 119, i32 0, i32 121, i32 0, i32 123, i32 0, i32 125, i32 0, i32 127> 5064 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5065 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias 5066 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5067 ret void 5068} 5069 5070define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5071; SSE-LABEL: 
vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5072; SSE: # %bb.0: 5073; SSE-NEXT: movdqa (%rdi), %xmm0 5074; SSE-NEXT: paddb (%rsi), %xmm0 5075; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5076; SSE-NEXT: movdqa 16(%rdx), %xmm1 5077; SSE-NEXT: paddb %xmm0, %xmm1 5078; SSE-NEXT: movdqa (%rdx), %xmm2 5079; SSE-NEXT: paddb %xmm0, %xmm2 5080; SSE-NEXT: movdqa 48(%rdx), %xmm3 5081; SSE-NEXT: paddb %xmm0, %xmm3 5082; SSE-NEXT: paddb 32(%rdx), %xmm0 5083; SSE-NEXT: movdqa %xmm0, 32(%rcx) 5084; SSE-NEXT: movdqa %xmm3, 48(%rcx) 5085; SSE-NEXT: movdqa %xmm2, (%rcx) 5086; SSE-NEXT: movdqa %xmm1, 16(%rcx) 5087; SSE-NEXT: retq 5088; 5089; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5090; AVX: # %bb.0: 5091; AVX-NEXT: vmovdqa (%rdi), %xmm0 5092; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5093; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5094; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5095; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5096; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5097; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5098; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5099; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5100; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5101; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5102; AVX-NEXT: retq 5103; 5104; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5105; AVX2: # %bb.0: 5106; AVX2-NEXT: vmovdqa (%rdi), %xmm0 5107; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5108; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 5109; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5110; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5111; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5112; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5113; AVX2-NEXT: vzeroupper 5114; AVX2-NEXT: retq 5115; 5116; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5117; AVX512F: # %bb.0: 5118; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 5119; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5120; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 5121; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5122; 
AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5123; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5124; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5125; AVX512F-NEXT: vzeroupper 5126; AVX512F-NEXT: retq 5127; 5128; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5129; AVX512DQ: # %bb.0: 5130; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 5131; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5132; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 5133; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5134; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5135; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5136; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5137; AVX512DQ-NEXT: vzeroupper 5138; AVX512DQ-NEXT: retq 5139; 5140; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5141; AVX512BW: # %bb.0: 5142; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5143; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5144; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 5145; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5146; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5147; AVX512BW-NEXT: vzeroupper 5148; AVX512BW-NEXT: retq 5149 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5150 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5151 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5152 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 0, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 0, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 0, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 0, i32 125, i32 126, i32 127> 5153 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5154 %out.vec = add <64 x i8> 
%broadcast.of.aextinreg, %out.vec.bias 5155 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5156 ret void 5157} 5158 5159define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5160; SSE-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5161; SSE: # %bb.0: 5162; SSE-NEXT: movdqa (%rdi), %xmm0 5163; SSE-NEXT: paddb (%rsi), %xmm0 5164; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5165; SSE-NEXT: movdqa 16(%rdx), %xmm1 5166; SSE-NEXT: paddb %xmm0, %xmm1 5167; SSE-NEXT: movdqa (%rdx), %xmm2 5168; SSE-NEXT: paddb %xmm0, %xmm2 5169; SSE-NEXT: movdqa 48(%rdx), %xmm3 5170; SSE-NEXT: paddb %xmm0, %xmm3 5171; SSE-NEXT: paddb 32(%rdx), %xmm0 5172; SSE-NEXT: movdqa %xmm0, 32(%rcx) 5173; SSE-NEXT: movdqa %xmm3, 48(%rcx) 5174; SSE-NEXT: movdqa %xmm2, (%rcx) 5175; SSE-NEXT: movdqa %xmm1, 16(%rcx) 5176; SSE-NEXT: retq 5177; 5178; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5179; AVX: # %bb.0: 5180; AVX-NEXT: vmovdqa (%rdi), %xmm0 5181; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5182; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5183; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5184; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5185; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5186; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5187; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5188; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5189; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5190; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5191; AVX-NEXT: retq 5192; 5193; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5194; AVX2: # %bb.0: 5195; AVX2-NEXT: vmovdqa (%rdi), %xmm0 5196; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5197; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 5198; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5199; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5200; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5201; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5202; AVX2-NEXT: vzeroupper 5203; AVX2-NEXT: retq 
5204; 5205; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5206; AVX512F: # %bb.0: 5207; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 5208; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5209; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 5210; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5211; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5212; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5213; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5214; AVX512F-NEXT: vzeroupper 5215; AVX512F-NEXT: retq 5216; 5217; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5218; AVX512DQ: # %bb.0: 5219; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 5220; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5221; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 5222; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5223; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5224; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5225; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5226; AVX512DQ-NEXT: vzeroupper 5227; AVX512DQ-NEXT: retq 5228; 5229; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5230; AVX512BW: # %bb.0: 5231; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5232; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5233; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 5234; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5235; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5236; AVX512BW-NEXT: vzeroupper 5237; AVX512BW-NEXT: retq 5238 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5239 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5240 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5241 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 0, i32 
105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5242 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5243 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias 5244 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5245 ret void 5246} 5247 5248define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5249; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5250; SSE: # %bb.0: 5251; SSE-NEXT: movdqa (%rdi), %xmm0 5252; SSE-NEXT: paddb (%rsi), %xmm0 5253; SSE-NEXT: movdqa 16(%rdx), %xmm1 5254; SSE-NEXT: paddb %xmm0, %xmm1 5255; SSE-NEXT: movdqa (%rdx), %xmm2 5256; SSE-NEXT: paddb %xmm0, %xmm2 5257; SSE-NEXT: movdqa 48(%rdx), %xmm3 5258; SSE-NEXT: paddb %xmm0, %xmm3 5259; SSE-NEXT: paddb 32(%rdx), %xmm0 5260; SSE-NEXT: movdqa %xmm0, 32(%rcx) 5261; SSE-NEXT: movdqa %xmm3, 48(%rcx) 5262; SSE-NEXT: movdqa %xmm2, (%rcx) 5263; SSE-NEXT: movdqa %xmm1, 16(%rcx) 5264; SSE-NEXT: retq 5265; 5266; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5267; AVX: # %bb.0: 5268; AVX-NEXT: vmovdqa (%rdi), %xmm0 5269; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5270; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5271; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5272; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5273; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5274; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5275; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5276; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5277; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5278; AVX-NEXT: retq 5279; 5280; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5281; AVX2: # %bb.0: 5282; AVX2-NEXT: vmovdqa (%rdi), %xmm0 5283; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5284; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 5285; AVX2-NEXT: 
vpaddb 32(%rdx), %ymm0, %ymm1 5286; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5287; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5288; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5289; AVX2-NEXT: vzeroupper 5290; AVX2-NEXT: retq 5291; 5292; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5293; AVX512F: # %bb.0: 5294; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 5295; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5296; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 5297; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5298; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5299; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5300; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5301; AVX512F-NEXT: vzeroupper 5302; AVX512F-NEXT: retq 5303; 5304; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5305; AVX512DQ: # %bb.0: 5306; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 5307; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5308; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 5309; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5310; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5311; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5312; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5313; AVX512DQ-NEXT: vzeroupper 5314; AVX512DQ-NEXT: retq 5315; 5316; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5317; AVX512BW: # %bb.0: 5318; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5319; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5320; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 5321; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5322; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5323; AVX512BW-NEXT: vzeroupper 5324; AVX512BW-NEXT: retq 5325 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5326 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5327 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5328 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 
78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5329 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5330 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias 5331 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5332 ret void 5333} 5334 5335define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5336; SSE-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5337; SSE: # %bb.0: 5338; SSE-NEXT: movdqa (%rdi), %xmm0 5339; SSE-NEXT: movdqa 16(%rdi), %xmm1 5340; SSE-NEXT: paddb (%rsi), %xmm0 5341; SSE-NEXT: paddb 16(%rsi), %xmm1 5342; SSE-NEXT: movdqa 16(%rdx), %xmm2 5343; SSE-NEXT: paddb %xmm1, %xmm2 5344; SSE-NEXT: movdqa (%rdx), %xmm3 5345; SSE-NEXT: paddb %xmm0, %xmm3 5346; SSE-NEXT: paddb 48(%rdx), %xmm1 5347; SSE-NEXT: paddb 32(%rdx), %xmm0 5348; SSE-NEXT: movdqa %xmm0, 32(%rcx) 5349; SSE-NEXT: movdqa %xmm1, 48(%rcx) 5350; SSE-NEXT: movdqa %xmm3, (%rcx) 5351; SSE-NEXT: movdqa %xmm2, 16(%rcx) 5352; SSE-NEXT: retq 5353; 5354; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5355; AVX: # %bb.0: 5356; AVX-NEXT: vmovdqa (%rdi), %xmm0 5357; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 5358; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5359; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 5360; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 5361; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 5362; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 5363; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5364; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5365; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 5366; 
AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 5367; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) 5368; AVX-NEXT: retq 5369; 5370; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5371; AVX2: # %bb.0: 5372; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5373; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5374; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5375; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5376; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5377; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5378; AVX2-NEXT: vzeroupper 5379; AVX2-NEXT: retq 5380; 5381; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5382; AVX512F: # %bb.0: 5383; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5384; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5385; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5386; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5387; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5388; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5389; AVX512F-NEXT: vzeroupper 5390; AVX512F-NEXT: retq 5391; 5392; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5393; AVX512DQ: # %bb.0: 5394; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5395; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5396; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5397; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5398; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5399; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5400; AVX512DQ-NEXT: vzeroupper 5401; AVX512DQ-NEXT: retq 5402; 5403; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5404; AVX512BW: # %bb.0: 5405; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 5406; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5407; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 5408; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5409; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5410; AVX512BW-NEXT: vzeroupper 5411; AVX512BW-NEXT: retq 5412 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5413 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5414 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 
5415 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5416 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5417 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias 5418 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5419 ret void 5420} 5421 5422; FIXME: all of the commented-out tests below crash llc during selection: 5423; define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5424; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5425; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5426; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5427; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 5428; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63> 5429; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8> 5430; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5431; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5432; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5433; ret void 5434; } 5435; 5436; define void
@vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5437; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5438; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5439; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5440; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 5441; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63> 5442; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8> 5443; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5444; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5445; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5446; ret void 5447; } 5448; 5449; define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5450; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5451; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5452; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5453; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 5454; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5455; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8> 5456; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5457; %out.vec 
= add <64 x i8> %out.bytevec, %out.vec.bias 5458; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5459; ret void 5460; } 5461; 5462; define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5463; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5464; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5465; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5466; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 5467; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5468; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8> 5469; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5470; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5471; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5472; ret void 5473; } 5474; 5475; define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5476; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5477; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5478; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5479; %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 5480; %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31> 5481; %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8> 5482; %out.vec.bias = load <64 x i8>, ptr 
%out.vec.bias.ptr, align 64 5483; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5484; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5485; ret void 5486; } 5487; 5488; define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5489; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5490; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5491; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5492; %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 5493; %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31> 5494; %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8> 5495; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5496; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5497; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5498; ret void 5499; } 5500; 5501; define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5502; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5503; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5504; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5505; %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 5506; %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 5507; %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8> 5508; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5509; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5510; 
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5511; ret void 5512; } 5513; 5514; define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5515; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5516; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5517; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5518; %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 5519; %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15> 5520; %out.bytevec = bitcast <8 x i64> %broadcast.of.aextinreg to <64 x i8> 5521; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5522; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5523; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5524; ret void 5525; } 5526; 5527; define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5528; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5529; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5530; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5531; %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 5532; %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15> 5533; %out.bytevec = bitcast <8 x i64> %broadcast.of.aextinreg to <64 x i8> 5534; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5535; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5536; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5537; ret void 5538; } 5539 5540define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr 
%out.vec.ptr) nounwind { 5541; SSE-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 5542; SSE: # %bb.0: 5543; SSE-NEXT: movdqa (%rdi), %xmm0 5544; SSE-NEXT: movdqa 16(%rdi), %xmm1 5545; SSE-NEXT: paddb (%rsi), %xmm0 5546; SSE-NEXT: paddb 16(%rsi), %xmm1 5547; SSE-NEXT: movdqa 16(%rdx), %xmm2 5548; SSE-NEXT: paddb %xmm1, %xmm2 5549; SSE-NEXT: movdqa (%rdx), %xmm3 5550; SSE-NEXT: paddb %xmm0, %xmm3 5551; SSE-NEXT: paddb 48(%rdx), %xmm1 5552; SSE-NEXT: paddb 32(%rdx), %xmm0 5553; SSE-NEXT: movdqa %xmm0, 32(%rcx) 5554; SSE-NEXT: movdqa %xmm1, 48(%rcx) 5555; SSE-NEXT: movdqa %xmm3, (%rcx) 5556; SSE-NEXT: movdqa %xmm2, 16(%rcx) 5557; SSE-NEXT: retq 5558; 5559; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 5560; AVX: # %bb.0: 5561; AVX-NEXT: vmovdqa (%rdi), %xmm0 5562; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 5563; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5564; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 5565; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 5566; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 5567; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 5568; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5569; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5570; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 5571; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 5572; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) 5573; AVX-NEXT: retq 5574; 5575; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 5576; AVX2: # %bb.0: 5577; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5578; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5579; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5580; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5581; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5582; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5583; AVX2-NEXT: vzeroupper 5584; AVX2-NEXT: retq 5585; 5586; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 5587; AVX512F: # %bb.0: 5588; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5589; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5590; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5591; AVX512F-NEXT: 
vpaddb (%rdx), %ymm0, %ymm0 5592; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5593; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5594; AVX512F-NEXT: vzeroupper 5595; AVX512F-NEXT: retq 5596; 5597; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 5598; AVX512DQ: # %bb.0: 5599; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5600; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5601; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5602; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5603; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5604; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5605; AVX512DQ-NEXT: vzeroupper 5606; AVX512DQ-NEXT: retq 5607; 5608; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 5609; AVX512BW: # %bb.0: 5610; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5611; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5612; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 5613; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5614; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5615; AVX512BW-NEXT: vzeroupper 5616; AVX512BW-NEXT: retq 5617 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5618 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5619 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5620 %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128> 5621 %broadcast.of.aextinreg = shufflevector <4 x i128> %in.vec.cast, <4 x i128> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 5622 %out.bytevec = bitcast <4 x i128> %broadcast.of.aextinreg to <64 x i8> 5623 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5624 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 5625 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5626 ret void 5627} 5628;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: 5629; AVX1-ONLY: {{.*}} 5630; FALLBACK0: {{.*}} 5631; FALLBACK1: {{.*}} 5632; FALLBACK10: {{.*}} 5633; FALLBACK11: {{.*}} 5634; FALLBACK12: {{.*}} 5635; FALLBACK13: {{.*}} 5636; FALLBACK2: {{.*}} 5637; FALLBACK3: {{.*}} 5638; FALLBACK4: {{.*}} 5639; FALLBACK5: {{.*}} 5640; FALLBACK6: {{.*}} 5641; FALLBACK7: {{.*}} 5642; FALLBACK8: {{.*}} 5643; FALLBACK9: {{.*}} 5644