; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13

define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec.padded = shufflevector <4 x i8> %broadcast.of.zextinreg, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.zextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: paddb (%rdx), %xmm3
; SSE2-NEXT: movdqa %xmm3, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.zextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec = bitcast <4 x i16> %broadcast.of.zextinreg to <8 x i8>
  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-NEXT: vmovd %xmm0, %eax
; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  %out.bytevec = bitcast <8 x i16> %broadcast.of.zextinreg to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb (%rdx), %xmm2
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vmovd %xmm0, %eax
; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i16> %broadcast.of.zextinreg to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 16(%rsi), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddb (%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb 16(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE42-NEXT: paddb (%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec = bitcast <4 x i32> %broadcast.of.zextinreg to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm1
; SSE2-NEXT: movdqa 48(%rdi), %xmm2
; SSE2-NEXT: paddb 48(%rsi), %xmm2
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 32(%rsi), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: paddb 16(%rdx), %xmm0
; SSE2-NEXT: paddb (%rdx), %xmm3
; SSE2-NEXT: movdqa %xmm3, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; SSE42-NEXT: pshufb %xmm3, %xmm2
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: paddb (%rdx), %xmm4
; SSE42-NEXT: movdqa %xmm4, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastb
%xmm1, %ymm1 1127; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] 1128; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1129; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1130; AVX512DQ-NEXT: vzeroupper 1131; AVX512DQ-NEXT: retq 1132; 1133; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: 1134; AVX512BW: # %bb.0: 1135; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1136; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1137; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1138; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] 1139; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 1140; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1141; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1142; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1143; AVX512BW-NEXT: vzeroupper 1144; AVX512BW-NEXT: retq 1145 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1146 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1147 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1148 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63> 1149 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1150 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1151 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1152 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1153 ret void 1154} 1155 1156define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1157; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1158; SSE2: # %bb.0: 1159; SSE2-NEXT: movdqa (%rdi), %xmm0 1160; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1161; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1162; SSE2-NEXT: paddb 48(%rsi), %xmm2 1163; SSE2-NEXT: paddb (%rsi), %xmm0 1164; SSE2-NEXT: paddb 32(%rsi), %xmm1 1165; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 1166; SSE2-NEXT: pand %xmm3, %xmm1 1167; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1168; SSE2-NEXT: pand 
%xmm3, %xmm2 1169; SSE2-NEXT: pandn %xmm0, %xmm3 1170; SSE2-NEXT: por %xmm3, %xmm1 1171; SSE2-NEXT: por %xmm2, %xmm3 1172; SSE2-NEXT: paddb 16(%rdx), %xmm3 1173; SSE2-NEXT: paddb (%rdx), %xmm1 1174; SSE2-NEXT: movdqa %xmm1, (%rcx) 1175; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 1176; SSE2-NEXT: retq 1177; 1178; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1179; SSE42: # %bb.0: 1180; SSE42-NEXT: movdqa (%rdi), %xmm0 1181; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1182; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1183; SSE42-NEXT: paddb 48(%rsi), %xmm2 1184; SSE42-NEXT: paddb 32(%rsi), %xmm1 1185; SSE42-NEXT: paddb (%rsi), %xmm0 1186; SSE42-NEXT: movdqa %xmm0, %xmm3 1187; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] 1188; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 1189; SSE42-NEXT: pshufb %xmm1, %xmm3 1190; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1191; SSE42-NEXT: pshufb %xmm1, %xmm0 1192; SSE42-NEXT: paddb 16(%rdx), %xmm0 1193; SSE42-NEXT: paddb (%rdx), %xmm3 1194; SSE42-NEXT: movdqa %xmm3, (%rcx) 1195; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1196; SSE42-NEXT: retq 1197; 1198; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1199; AVX: # %bb.0: 1200; AVX-NEXT: vmovdqa (%rdi), %xmm0 1201; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1202; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1203; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1204; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1205; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1206; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1207; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] 1208; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1209; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1210; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1211; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1212; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1213; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1214; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1215; AVX-NEXT: retq 1216; 1217; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1218; AVX2: # %bb.0: 1219; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1220; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1221; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1222; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1223; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 1224; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 1225; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1226; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1227; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1228; AVX2-NEXT: vzeroupper 1229; AVX2-NEXT: retq 1230; 1231; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1232; AVX512F: # %bb.0: 1233; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1234; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1235; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1236; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1237; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 1238; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1239; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1240; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1241; AVX512F-NEXT: vzeroupper 1242; AVX512F-NEXT: retq 1243; 1244; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1245; AVX512DQ: # %bb.0: 1246; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1247; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1248; AVX512DQ-NEXT: 
vmovdqa (%rdi), %xmm1 1249; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1250; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1 1251; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1252; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1253; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1254; AVX512DQ-NEXT: vzeroupper 1255; AVX512DQ-NEXT: retq 1256; 1257; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: 1258; AVX512BW: # %bb.0: 1259; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1260; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1261; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1262; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0 1263; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 1264; AVX512BW-NEXT: kmovd %eax, %k1 1265; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 1266; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 1267; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1268; AVX512BW-NEXT: vzeroupper 1269; AVX512BW-NEXT: retq 1270 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1271 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1272 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1273 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63> 1274 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1275 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1276 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1277 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1278 ret void 1279} 1280 1281define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1282; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1283; SSE2: # %bb.0: 1284; SSE2-NEXT: movdqa (%rdi), %xmm0 1285; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1286; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1287; SSE2-NEXT: paddb 48(%rsi), %xmm2 1288; SSE2-NEXT: paddb (%rsi), %xmm0 1289; SSE2-NEXT: paddb 32(%rsi), %xmm1 1290; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 1291; SSE2-NEXT: pand %xmm3, %xmm1 1292; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1293; SSE2-NEXT: pand %xmm3, %xmm2 1294; SSE2-NEXT: pandn %xmm0, %xmm3 1295; SSE2-NEXT: por %xmm3, %xmm1 1296; SSE2-NEXT: por %xmm2, %xmm3 1297; SSE2-NEXT: paddb 16(%rdx), %xmm3 1298; SSE2-NEXT: paddb (%rdx), %xmm1 1299; SSE2-NEXT: movdqa %xmm1, (%rcx) 1300; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 1301; SSE2-NEXT: retq 1302; 1303; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1304; SSE42: # %bb.0: 1305; SSE42-NEXT: movdqa 
(%rdi), %xmm0 1306; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1307; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1308; SSE42-NEXT: paddb 48(%rsi), %xmm2 1309; SSE42-NEXT: paddb 32(%rsi), %xmm1 1310; SSE42-NEXT: paddb (%rsi), %xmm0 1311; SSE42-NEXT: movdqa %xmm0, %xmm3 1312; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] 1313; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 1314; SSE42-NEXT: pshufb %xmm1, %xmm3 1315; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1316; SSE42-NEXT: pshufb %xmm1, %xmm0 1317; SSE42-NEXT: paddb 16(%rdx), %xmm0 1318; SSE42-NEXT: paddb (%rdx), %xmm3 1319; SSE42-NEXT: movdqa %xmm3, (%rcx) 1320; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1321; SSE42-NEXT: retq 1322; 1323; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1324; AVX: # %bb.0: 1325; AVX-NEXT: vmovdqa (%rdi), %xmm0 1326; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1327; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1328; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1329; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1330; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1331; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1332; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] 1333; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1334; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1335; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1336; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1337; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1338; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1339; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1340; AVX-NEXT: retq 1341; 1342; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1343; AVX2: # %bb.0: 1344; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1345; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1346; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1347; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1348; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 1349; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 1350; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1351; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1352; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1353; AVX2-NEXT: vzeroupper 1354; AVX2-NEXT: retq 1355; 1356; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1357; AVX512F: # %bb.0: 1358; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1359; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1360; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1361; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1362; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 1363; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1364; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1365; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1366; AVX512F-NEXT: vzeroupper 1367; AVX512F-NEXT: retq 1368; 1369; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1370; AVX512DQ: # %bb.0: 1371; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1372; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1373; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1374; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1375; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 1376; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 1377; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 1378; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1379; AVX512DQ-NEXT: vzeroupper 1380; AVX512DQ-NEXT: retq 1381; 1382; AVX512BW-LABEL: 
vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: 1383; AVX512BW: # %bb.0: 1384; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1385; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1386; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1387; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0 1388; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 1389; AVX512BW-NEXT: kmovd %eax, %k1 1390; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 1391; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 1392; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1393; AVX512BW-NEXT: vzeroupper 1394; AVX512BW-NEXT: retq 1395 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1396 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1397 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1398 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1399 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1400 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1401 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1402 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1403 ret void 1404} 1405 1406define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1407; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1408; SSE2: # %bb.0: 1409; SSE2-NEXT: movdqa (%rdi), %xmm0 1410; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1411; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1412; SSE2-NEXT: paddb 48(%rsi), %xmm2 1413; SSE2-NEXT: paddb 32(%rsi), %xmm1 1414; SSE2-NEXT: paddb (%rsi), %xmm0 1415; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1416; SSE2-NEXT: pand %xmm3, %xmm1 1417; SSE2-NEXT: pand %xmm3, %xmm2 1418; SSE2-NEXT: pandn %xmm0, %xmm3 1419; SSE2-NEXT: por %xmm3, %xmm1 1420; SSE2-NEXT: por %xmm3, %xmm2 1421; SSE2-NEXT: paddb 16(%rdx), %xmm2 1422; SSE2-NEXT: paddb (%rdx), %xmm1 1423; SSE2-NEXT: movdqa %xmm1, (%rcx) 1424; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 1425; SSE2-NEXT: retq 1426; 1427; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1428; SSE42: # %bb.0: 1429; SSE42-NEXT: movdqa (%rdi), %xmm1 1430; SSE42-NEXT: movdqa 32(%rdi), %xmm2 1431; SSE42-NEXT: movdqa 48(%rdi), %xmm3 1432; SSE42-NEXT: paddb 48(%rsi), %xmm3 1433; SSE42-NEXT: paddb 32(%rsi), %xmm2 1434; SSE42-NEXT: paddb (%rsi), %xmm1 1435; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1436; SSE42-NEXT: movdqa %xmm1, %xmm4 1437; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 1438; 
SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1439; SSE42-NEXT: paddb 16(%rdx), %xmm1 1440; SSE42-NEXT: paddb (%rdx), %xmm4 1441; SSE42-NEXT: movdqa %xmm4, (%rcx) 1442; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 1443; SSE42-NEXT: retq 1444; 1445; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1446; AVX: # %bb.0: 1447; AVX-NEXT: vmovdqa (%rdi), %xmm0 1448; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1449; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1450; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1451; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1452; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1453; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] 1454; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 1455; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 1456; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1457; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1458; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1459; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1460; AVX-NEXT: retq 1461; 1462; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1463; AVX2: # %bb.0: 1464; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1465; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1466; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1467; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1468; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1469; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] 1470; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 1471; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1472; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1473; AVX2-NEXT: vzeroupper 1474; AVX2-NEXT: retq 1475; 1476; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1477; AVX512F: # %bb.0: 1478; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1479; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 1480; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1481; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1482; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1483; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1484; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 1485; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) 1486; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 1487; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1488; AVX512F-NEXT: vzeroupper 1489; AVX512F-NEXT: retq 1490; 1491; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1492; AVX512DQ: # %bb.0: 1493; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1494; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 1495; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1496; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1497; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1498; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1499; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] 1500; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) 1501; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 1502; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1503; AVX512DQ-NEXT: vzeroupper 1504; AVX512DQ-NEXT: retq 1505; 1506; AVX512BW-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: 1507; AVX512BW: # %bb.0: 1508; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1509; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1510; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1511; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1512; 
AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001 1513; AVX512BW-NEXT: kmovd %eax, %k1 1514; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 1515; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 1516; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1517; AVX512BW-NEXT: vzeroupper 1518; AVX512BW-NEXT: retq 1519 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1520 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1521 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1522 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1523 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1524 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1525 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1526 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1527 ret void 1528} 1529 1530define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1531; SSE2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1532; SSE2: # %bb.0: 1533; SSE2-NEXT: movdqa (%rdi), %xmm0 1534; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1535; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1536; SSE2-NEXT: paddb 48(%rsi), %xmm2 1537; SSE2-NEXT: paddb 32(%rsi), %xmm1 1538; SSE2-NEXT: paddb (%rsi), %xmm0 1539; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1540; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 1541; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 1542; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1543; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1544; SSE2-NEXT: movdqa %xmm0, %xmm3 1545; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 1546; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] 1547; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 1548; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1549; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1550; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1551; SSE2-NEXT: paddb 16(%rdx), %xmm0 1552; SSE2-NEXT: paddb (%rdx), %xmm3 1553; SSE2-NEXT: movdqa %xmm3, (%rcx) 1554; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1555; SSE2-NEXT: retq 1556; 1557; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1558; SSE42: # %bb.0: 1559; SSE42-NEXT: movdqa (%rdi), %xmm0 1560; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1561; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1562; SSE42-NEXT: paddb 48(%rsi), %xmm2 1563; SSE42-NEXT: paddb (%rsi), %xmm0 1564; 
SSE42-NEXT: paddb 32(%rsi), %xmm1 1565; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] 1566; SSE42-NEXT: pshufb %xmm3, %xmm1 1567; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1568; SSE42-NEXT: movdqa %xmm0, %xmm4 1569; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 1570; SSE42-NEXT: pshufb %xmm3, %xmm2 1571; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1572; SSE42-NEXT: paddb 16(%rdx), %xmm0 1573; SSE42-NEXT: paddb (%rdx), %xmm4 1574; SSE42-NEXT: movdqa %xmm4, (%rcx) 1575; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1576; SSE42-NEXT: retq 1577; 1578; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1579; AVX: # %bb.0: 1580; AVX-NEXT: vmovdqa (%rdi), %xmm0 1581; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1582; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1583; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1584; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1585; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1586; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] 1587; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1588; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1589; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1590; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1591; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1592; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1593; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1594; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1595; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1596; AVX-NEXT: retq 1597; 1598; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1599; AVX2: # %bb.0: 1600; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1601; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1602; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1603; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1604; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 1605; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1606; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1607; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1608; AVX2-NEXT: vzeroupper 1609; AVX2-NEXT: retq 1610; 1611; AVX512F-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1612; AVX512F: # %bb.0: 1613; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1614; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1615; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1616; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1617; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 1618; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1619; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1620; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1621; AVX512F-NEXT: vzeroupper 1622; AVX512F-NEXT: retq 1623; 1624; AVX512DQ-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1625; AVX512DQ: # %bb.0: 1626; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1627; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1628; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1629; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1630; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 1631; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1632; AVX512DQ-NEXT: vpaddb (%rdx), 
%ymm0, %ymm0 1633; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1634; AVX512DQ-NEXT: vzeroupper 1635; AVX512DQ-NEXT: retq 1636; 1637; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: 1638; AVX512BW: # %bb.0: 1639; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1640; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1641; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] 1642; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 1643; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 1644; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1645; AVX512BW-NEXT: vzeroupper 1646; AVX512BW-NEXT: retq 1647 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1648 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1649 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1650 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 1651 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31> 1652 %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8> 1653 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1654 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1655 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1656 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1657 ret void 1658} 1659 1660define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1661; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1662; SSE2: # %bb.0: 1663; SSE2-NEXT: movdqa (%rdi), %xmm0 1664; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1665; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1666; SSE2-NEXT: paddb 48(%rsi), %xmm2 1667; SSE2-NEXT: paddb (%rsi), %xmm0 1668; SSE2-NEXT: paddb 32(%rsi), %xmm1 1669; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] 1670; SSE2-NEXT: pand %xmm3, %xmm1 1671; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1672; SSE2-NEXT: pand %xmm3, %xmm2 1673; SSE2-NEXT: pandn %xmm0, %xmm3 1674; SSE2-NEXT: por %xmm3, %xmm1 1675; SSE2-NEXT: por %xmm2, %xmm3 1676; SSE2-NEXT: paddb 16(%rdx), %xmm3 1677; SSE2-NEXT: paddb (%rdx), %xmm1 1678; SSE2-NEXT: movdqa %xmm1, (%rcx) 1679; SSE2-NEXT: movdqa %xmm3, 16(%rcx) 1680; SSE2-NEXT: retq 1681; 1682; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1683; SSE42: # %bb.0: 1684; SSE42-NEXT: movdqa (%rdi), %xmm0 1685; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1686; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1687; SSE42-NEXT: paddb 48(%rsi), %xmm2 1688; SSE42-NEXT: paddb 32(%rsi), %xmm1 1689; SSE42-NEXT: paddb (%rsi), %xmm0 1690; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1691; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 1692; 
SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 1693; SSE42-NEXT: paddb 16(%rdx), %xmm2 1694; SSE42-NEXT: paddb (%rdx), %xmm1 1695; SSE42-NEXT: movdqa %xmm1, (%rcx) 1696; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 1697; SSE42-NEXT: retq 1698; 1699; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1700; AVX: # %bb.0: 1701; AVX-NEXT: vmovdqa (%rdi), %xmm0 1702; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1703; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1704; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1705; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1706; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1707; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1708; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 1709; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 1710; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1711; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1712; AVX-NEXT: vmovdqa %xmm1, (%rcx) 1713; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1714; AVX-NEXT: retq 1715; 1716; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1717; AVX2: # %bb.0: 1718; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1719; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1720; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1721; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1722; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 1723; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] 1724; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1725; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1726; AVX2-NEXT: vzeroupper 1727; AVX2-NEXT: retq 1728; 1729; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1730; AVX512F: # %bb.0: 1731; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 1732; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1733; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 1734; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1735; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 1736; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] 1737; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1738; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1739; AVX512F-NEXT: vzeroupper 1740; AVX512F-NEXT: retq 1741; 1742; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1743; AVX512DQ: # %bb.0: 1744; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 1745; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1746; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1747; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1748; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 1749; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] 1750; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1751; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1752; AVX512DQ-NEXT: vzeroupper 1753; AVX512DQ-NEXT: retq 1754; 1755; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: 1756; AVX512BW: # %bb.0: 1757; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1758; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1759; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1760; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] 1761; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 1762; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 1763; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1764; AVX512BW-NEXT: vzeroupper 1765; AVX512BW-NEXT: retq 1766 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1767 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1768 %in.vec = add 
<64 x i8> %in.vec.base, %in.vec.bias 1769 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 1770 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31> 1771 %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8> 1772 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1773 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1774 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1775 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1776 ret void 1777} 1778 1779define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1780; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1781; SSE2: # %bb.0: 1782; SSE2-NEXT: movdqa (%rdi), %xmm0 1783; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1784; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1785; SSE2-NEXT: paddb 48(%rsi), %xmm2 1786; SSE2-NEXT: paddb 32(%rsi), %xmm1 1787; SSE2-NEXT: paddb (%rsi), %xmm0 1788; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] 1789; SSE2-NEXT: pand %xmm3, %xmm1 1790; SSE2-NEXT: pand %xmm3, %xmm2 1791; SSE2-NEXT: pandn %xmm0, %xmm3 1792; SSE2-NEXT: por %xmm3, %xmm1 1793; SSE2-NEXT: por %xmm3, %xmm2 1794; SSE2-NEXT: paddb 16(%rdx), %xmm2 1795; SSE2-NEXT: paddb (%rdx), %xmm1 1796; SSE2-NEXT: movdqa %xmm1, (%rcx) 1797; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 1798; SSE2-NEXT: retq 1799; 1800; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1801; SSE42: # %bb.0: 1802; SSE42-NEXT: movdqa (%rdi), %xmm0 1803; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1804; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1805; SSE42-NEXT: paddb 48(%rsi), %xmm2 1806; SSE42-NEXT: paddb (%rsi), %xmm0 1807; SSE42-NEXT: paddb 32(%rsi), %xmm1 1808; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1809; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 1810; SSE42-NEXT: paddb 16(%rdx), %xmm0 1811; SSE42-NEXT: paddb (%rdx), %xmm1 1812; SSE42-NEXT: movdqa %xmm1, (%rcx) 1813; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 1814; SSE42-NEXT: retq 1815; 1816; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1817; AVX: # %bb.0: 1818; AVX-NEXT: vmovdqa (%rdi), %xmm0 1819; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1820; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1821; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1822; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1823; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1824; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1825; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 1826; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 1827; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 1828; AVX-NEXT: vmovdqa %xmm1, (%rcx) 
1829; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1830; AVX-NEXT: retq 1831; 1832; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1833; AVX2: # %bb.0: 1834; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1835; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1836; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1837; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1838; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1839; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 1840; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1841; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1842; AVX2-NEXT: vzeroupper 1843; AVX2-NEXT: retq 1844; 1845; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1846; AVX512F: # %bb.0: 1847; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1848; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 1849; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1850; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1851; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1852; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 1853; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1854; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 1855; AVX512F-NEXT: vzeroupper 1856; AVX512F-NEXT: retq 1857; 1858; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1859; AVX512DQ: # %bb.0: 1860; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1861; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 1862; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 1863; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 1864; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 1865; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 1866; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1867; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 1868; AVX512DQ-NEXT: vzeroupper 1869; AVX512DQ-NEXT: retq 1870; 1871; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: 1872; AVX512BW: # %bb.0: 1873; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1874; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 1875; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1876; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] 1877; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 1878; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 1879; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1880; AVX512BW-NEXT: vzeroupper 1881; AVX512BW-NEXT: retq 1882 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 1883 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 1884 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 1885 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 1886 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1887 %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8> 1888 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1889 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 1890 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 1891 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 1892 ret void 1893} 1894 1895define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 1896; SSE2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1897; SSE2: # %bb.0: 1898; SSE2-NEXT: movdqa (%rdi), %xmm0 1899; SSE2-NEXT: movdqa 32(%rdi), %xmm1 1900; SSE2-NEXT: movdqa 48(%rdi), %xmm2 1901; SSE2-NEXT: paddb 48(%rsi), %xmm2 1902; SSE2-NEXT: paddb (%rsi), %xmm0 1903; SSE2-NEXT: paddb 32(%rsi), %xmm1 1904; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1905; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 1906; SSE2-NEXT: movdqa %xmm0, %xmm3 1907; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1908; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 1909; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1910; SSE2-NEXT: paddb 16(%rdx), %xmm0 1911; SSE2-NEXT: paddb (%rdx), %xmm3 1912; SSE2-NEXT: movdqa %xmm3, (%rcx) 1913; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 1914; SSE2-NEXT: retq 1915; 1916; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1917; SSE42: # %bb.0: 1918; SSE42-NEXT: movdqa (%rdi), %xmm0 1919; SSE42-NEXT: movdqa 32(%rdi), %xmm1 1920; SSE42-NEXT: movdqa 48(%rdi), %xmm2 1921; SSE42-NEXT: paddb 48(%rsi), %xmm2 1922; SSE42-NEXT: paddb 32(%rsi), %xmm1 1923; SSE42-NEXT: paddb (%rsi), %xmm0 1924; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1925; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1926; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1927; SSE42-NEXT: paddb 16(%rdx), %xmm2 1928; SSE42-NEXT: paddb (%rdx), %xmm1 1929; SSE42-NEXT: movdqa %xmm1, (%rcx) 1930; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 1931; SSE42-NEXT: retq 1932; 1933; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1934; AVX: # %bb.0: 1935; AVX-NEXT: vmovdqa (%rdi), %xmm0 1936; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1937; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1938; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 1939; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 1940; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1941; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 1942; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1943; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] 1944; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 1945; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1946; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 1947; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 1948; AVX-NEXT: vmovdqa %xmm0, (%rcx) 1949; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 1950; AVX-NEXT: vzeroupper 1951; AVX-NEXT: retq 1952; 1953; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1954; AVX2: # %bb.0: 1955; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 1956; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1957; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1958; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1959; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 1960; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 1961; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1962; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 1963; AVX2-NEXT: vzeroupper 1964; AVX2-NEXT: retq 1965; 1966; 
AVX512F-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1967; AVX512F-SLOW: # %bb.0: 1968; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 1969; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1970; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 1971; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1972; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 1973; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 1974; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 1975; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 1976; AVX512F-SLOW-NEXT: vzeroupper 1977; AVX512F-SLOW-NEXT: retq 1978; 1979; AVX512F-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1980; AVX512F-FAST: # %bb.0: 1981; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 1982; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1983; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 1984; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1985; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] 1986; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 1987; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 1988; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 1989; AVX512F-FAST-NEXT: vzeroupper 1990; AVX512F-FAST-NEXT: retq 1991; 1992; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 1993; AVX512DQ-SLOW: # %bb.0: 1994; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 1995; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 1996; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 1997; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 1998; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 1999; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 2000; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2001; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2002; AVX512DQ-SLOW-NEXT: vzeroupper 2003; AVX512DQ-SLOW-NEXT: retq 2004; 2005; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 2006; AVX512DQ-FAST: # %bb.0: 2007; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 2008; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2009; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 2010; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2011; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] 2012; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 2013; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2014; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2015; AVX512DQ-FAST-NEXT: vzeroupper 2016; AVX512DQ-FAST-NEXT: retq 2017; 2018; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 2019; AVX512BW-SLOW: # %bb.0: 2020; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 2021; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2022; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2023; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 2024; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 2025; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2026; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 2027; AVX512BW-SLOW-NEXT: vzeroupper 2028; AVX512BW-SLOW-NEXT: retq 2029; 2030; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 2031; AVX512BW-FAST: # %bb.0: 2032; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 2033; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] 2034; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2035; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 2036; 
AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2037; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 2038; AVX512BW-FAST-NEXT: vzeroupper 2039; AVX512BW-FAST-NEXT: retq 2040 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2041 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2042 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2043 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 2044 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15> 2045 %out.bytevec = bitcast <8 x i32> %broadcast.of.zextinreg to <32 x i8> 2046 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2047 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2048 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2049 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2050 ret void 2051} 2052 2053define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2054; SSE2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2055; SSE2: # %bb.0: 2056; SSE2-NEXT: movdqa (%rdi), %xmm0 2057; SSE2-NEXT: movdqa 32(%rdi), %xmm1 2058; SSE2-NEXT: movdqa 48(%rdi), %xmm2 2059; SSE2-NEXT: paddb 48(%rsi), %xmm2 2060; SSE2-NEXT: paddb (%rsi), %xmm0 2061; SSE2-NEXT: paddb 32(%rsi), %xmm1 2062; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2063; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 2064; SSE2-NEXT: paddb 16(%rdx), %xmm2 2065; SSE2-NEXT: paddb (%rdx), %xmm1 2066; SSE2-NEXT: movdqa %xmm1, (%rcx) 2067; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 2068; SSE2-NEXT: retq 2069; 2070; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2071; SSE42: # %bb.0: 2072; SSE42-NEXT: movdqa (%rdi), %xmm0 2073; SSE42-NEXT: movdqa 32(%rdi), %xmm1 2074; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2075; SSE42-NEXT: paddb 48(%rsi), %xmm2 2076; SSE42-NEXT: paddb (%rsi), %xmm0 2077; SSE42-NEXT: paddb 32(%rsi), %xmm1 2078; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 2079; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 2080; SSE42-NEXT: paddb 16(%rdx), %xmm0 2081; SSE42-NEXT: paddb (%rdx), %xmm1 2082; SSE42-NEXT: movdqa %xmm1, (%rcx) 2083; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2084; SSE42-NEXT: retq 2085; 2086; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2087; AVX: # %bb.0: 2088; AVX-NEXT: vmovdqa (%rdi), %xmm0 2089; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 2090; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 2091; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 2092; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 2093; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2094; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2095; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2096; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2097; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 2098; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 2099; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2100; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2101; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 2102; AVX-NEXT: vzeroupper 2103; AVX-NEXT: retq 2104; 2105; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2106; AVX2: # %bb.0: 2107; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2108; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2109; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2110; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2111; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2112; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2113; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2114; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2115; AVX2-NEXT: vzeroupper 2116; AVX2-NEXT: retq 2117; 2118; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2119; AVX512F-SLOW: # %bb.0: 2120; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 2121; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 2122; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2123; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2124; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2125; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2126; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2127; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2128; AVX512F-SLOW-NEXT: vzeroupper 2129; AVX512F-SLOW-NEXT: retq 2130; 2131; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2132; AVX512F-FAST: # %bb.0: 2133; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 2134; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2135; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2136; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2137; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] 2138; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 2139; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2140; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2141; AVX512F-FAST-NEXT: vzeroupper 2142; AVX512F-FAST-NEXT: retq 2143; 2144; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2145; AVX512DQ-SLOW: # %bb.0: 2146; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 2147; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 2148; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2149; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2150; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2151; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2152; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2153; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2154; AVX512DQ-SLOW-NEXT: vzeroupper 2155; AVX512DQ-SLOW-NEXT: retq 2156; 2157; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2158; AVX512DQ-FAST: # %bb.0: 2159; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 2160; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2161; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2162; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2163; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] 2164; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 2165; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2166; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2167; AVX512DQ-FAST-NEXT: vzeroupper 2168; AVX512DQ-FAST-NEXT: retq 2169; 2170; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2171; AVX512BW-SLOW: # %bb.0: 2172; AVX512BW-SLOW-NEXT: 
vmovdqa64 (%rdi), %zmm0 2173; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2174; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2175; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2176; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 2177; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2178; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 2179; AVX512BW-SLOW-NEXT: vzeroupper 2180; AVX512BW-SLOW-NEXT: retq 2181; 2182; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: 2183; AVX512BW-FAST: # %bb.0: 2184; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 2185; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] 2186; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2187; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 2188; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2189; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 2190; AVX512BW-FAST-NEXT: vzeroupper 2191; AVX512BW-FAST-NEXT: retq 2192 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2193 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2194 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2195 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 2196 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15> 2197 %out.bytevec = bitcast <8 x i32> %broadcast.of.zextinreg to <32 x i8> 2198 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2199 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2200 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2201 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2202 ret void 2203} 2204 2205define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2206; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2207; SSE2: # %bb.0: 2208; SSE2-NEXT: movdqa (%rdi), %xmm0 2209; SSE2-NEXT: movdqa 32(%rdi), %xmm1 2210; SSE2-NEXT: movdqa 48(%rdi), %xmm2 2211; SSE2-NEXT: paddb 48(%rsi), %xmm2 2212; SSE2-NEXT: paddb (%rsi), %xmm0 2213; SSE2-NEXT: paddb 32(%rsi), %xmm1 2214; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2215; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] 2216; SSE2-NEXT: paddb 16(%rdx), %xmm0 2217; SSE2-NEXT: paddb (%rdx), %xmm1 2218; SSE2-NEXT: movdqa %xmm1, (%rcx) 2219; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2220; SSE2-NEXT: retq 2221; 2222; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2223; SSE42: # %bb.0: 2224; SSE42-NEXT: movdqa (%rdi), %xmm0 2225; SSE42-NEXT: movdqa 32(%rdi), %xmm1 2226; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2227; SSE42-NEXT: paddb 48(%rsi), %xmm2 2228; SSE42-NEXT: paddb 32(%rsi), %xmm1 2229; SSE42-NEXT: paddb (%rsi), %xmm0 2230; 
SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2231; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 2232; SSE42-NEXT: paddb 16(%rdx), %xmm0 2233; SSE42-NEXT: paddb (%rdx), %xmm1 2234; SSE42-NEXT: movdqa %xmm1, (%rcx) 2235; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2236; SSE42-NEXT: retq 2237; 2238; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2239; AVX: # %bb.0: 2240; AVX-NEXT: vmovdqa (%rdi), %xmm0 2241; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 2242; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 2243; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 2244; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 2245; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2246; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2247; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2248; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 2249; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 2250; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 2251; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 2252; AVX-NEXT: vmovdqa %xmm0, (%rcx) 2253; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 2254; AVX-NEXT: vzeroupper 2255; AVX-NEXT: retq 2256; 2257; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2258; AVX2: # %bb.0: 2259; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 2260; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2261; AVX2-NEXT: vmovdqa (%rdi), %xmm1 2262; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2263; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 2264; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 2265; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2266; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2267; AVX2-NEXT: vzeroupper 2268; AVX2-NEXT: retq 2269; 2270; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2271; AVX512F-SLOW: # %bb.0: 2272; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 2273; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2274; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 2275; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2276; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 2277; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 2278; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2279; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2280; AVX512F-SLOW-NEXT: vzeroupper 2281; AVX512F-SLOW-NEXT: retq 2282; 2283; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2284; AVX512F-FAST: # %bb.0: 2285; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 2286; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2287; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2288; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2289; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] 2290; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 2291; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2292; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2293; AVX512F-FAST-NEXT: vzeroupper 2294; AVX512F-FAST-NEXT: retq 2295; 2296; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2297; AVX512DQ-SLOW: # %bb.0: 2298; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 2299; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2300; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 2301; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2302; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 2303; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 2304; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2305; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) 2306; AVX512DQ-SLOW-NEXT: vzeroupper 2307; AVX512DQ-SLOW-NEXT: retq 2308; 2309; 
AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2310; AVX512DQ-FAST: # %bb.0: 2311; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 2312; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 2313; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2314; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2315; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] 2316; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 2317; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 2318; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) 2319; AVX512DQ-FAST-NEXT: vzeroupper 2320; AVX512DQ-FAST-NEXT: retq 2321; 2322; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2323; AVX512BW-SLOW: # %bb.0: 2324; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 2325; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2326; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2327; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 2328; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 2329; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2330; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 2331; AVX512BW-SLOW-NEXT: vzeroupper 2332; AVX512BW-SLOW-NEXT: retq 2333; 2334; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: 2335; AVX512BW-FAST: # %bb.0: 2336; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 2337; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] 2338; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2339; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 2340; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2341; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 2342; AVX512BW-FAST-NEXT: vzeroupper 2343; AVX512BW-FAST-NEXT: retq 2344 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2345 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2346 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2347 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 2348 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 2349 %out.bytevec = bitcast <4 x i64> %broadcast.of.zextinreg to <32 x i8> 2350 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2351 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2352 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2353 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2354 ret void 2355} 2356 2357define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2358; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2359; SSE2: # %bb.0: 2360; SSE2-NEXT: movdqa (%rdi), %xmm0 2361; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2362; SSE2-NEXT: paddb (%rsi), %xmm0 2363; SSE2-NEXT: paddb 48(%rsi), %xmm1 2364; SSE2-NEXT: movdqa {{.*#+}} xmm2 
= [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2365; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,0,0,0,4,5,6,7] 2366; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 2367; SSE2-NEXT: pand %xmm2, %xmm3 2368; SSE2-NEXT: pandn %xmm1, %xmm2 2369; SSE2-NEXT: por %xmm3, %xmm2 2370; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2371; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2372; SSE2-NEXT: pxor %xmm1, %xmm1 2373; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2374; SSE2-NEXT: paddb (%rdx), %xmm2 2375; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2376; SSE2-NEXT: paddb %xmm0, %xmm1 2377; SSE2-NEXT: paddb 32(%rdx), %xmm0 2378; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2379; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2380; SSE2-NEXT: movdqa %xmm2, (%rcx) 2381; SSE2-NEXT: retq 2382; 2383; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2384; SSE42: # %bb.0: 2385; SSE42-NEXT: movdqa (%rdi), %xmm1 2386; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2387; SSE42-NEXT: paddb 48(%rsi), %xmm2 2388; SSE42-NEXT: paddb (%rsi), %xmm1 2389; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] 2390; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 2391; SSE42-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2392; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm2 2393; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero 2394; SSE42-NEXT: paddb (%rdx), %xmm2 2395; SSE42-NEXT: movdqa 16(%rdx), %xmm0 2396; SSE42-NEXT: paddb %xmm1, %xmm0 2397; SSE42-NEXT: paddb 32(%rdx), %xmm1 2398; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2399; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2400; SSE42-NEXT: movdqa %xmm2, (%rcx) 2401; SSE42-NEXT: retq 2402; 2403; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2404; AVX: # %bb.0: 2405; AVX-NEXT: vmovdqa (%rdi), %xmm0 2406; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2407; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2408; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2409; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 2410; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 2411; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2412; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 2413; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero 2414; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2415; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 2416; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2417; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2418; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 2419; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2420; AVX-NEXT: retq 2421; 2422; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2423; AVX2: # %bb.0: 2424; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2425; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2426; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2427; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2428; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 2429; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero 2430; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] 2431; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = 
[255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2432; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 2433; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero 2434; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2435; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2436; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2437; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2438; AVX2-NEXT: vzeroupper 2439; AVX2-NEXT: retq 2440; 2441; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2442; AVX512F: # %bb.0: 2443; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2444; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2445; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2446; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2447; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2448; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 2449; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2450; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2451; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2452; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2453; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2454; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2455; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2456; AVX512F-NEXT: vzeroupper 2457; AVX512F-NEXT: retq 2458; 2459; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2460; AVX512DQ: # %bb.0: 2461; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2462; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2463; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2464; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2465; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2466; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 2467; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2468; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2469; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2470; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2471; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2472; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2473; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2474; AVX512DQ-NEXT: vzeroupper 2475; AVX512DQ-NEXT: retq 2476; 2477; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: 2478; AVX512BW: # %bb.0: 2479; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2480; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2481; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2482; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 2483; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 2484; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 2485; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2486; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2487; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2488; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2489; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2490; AVX512BW-NEXT: vzeroupper 2491; AVX512BW-NEXT: retq 2492 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2493 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2494 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2495 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63, i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95> 2496 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2497 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2498 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2499 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2500 ret void 2501} 2502 2503define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2504; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2505; SSE2: # %bb.0: 2506; SSE2-NEXT: movdqa (%rdi), %xmm0 2507; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2508; SSE2-NEXT: paddb (%rsi), %xmm0 2509; SSE2-NEXT: paddb 48(%rsi), %xmm1 2510; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 2511; SSE2-NEXT: pand %xmm2, %xmm1 2512; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2513; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2514; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2515; SSE2-NEXT: pandn %xmm0, %xmm2 2516; SSE2-NEXT: por %xmm1, %xmm2 2517; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0] 2518; SSE2-NEXT: pand %xmm0, %xmm1 2519; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2520; SSE2-NEXT: paddb (%rdx), %xmm2 2521; SSE2-NEXT: paddb 16(%rdx), %xmm0 2522; SSE2-NEXT: paddb 32(%rdx), %xmm1 2523; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 2524; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2525; SSE2-NEXT: movdqa %xmm2, (%rcx) 2526; SSE2-NEXT: retq 2527; 2528; SSE42-LABEL: 
vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2529; SSE42: # %bb.0: 2530; SSE42-NEXT: movdqa (%rdi), %xmm1 2531; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2532; SSE42-NEXT: paddb 48(%rsi), %xmm2 2533; SSE42-NEXT: paddb (%rsi), %xmm1 2534; SSE42-NEXT: pxor %xmm0, %xmm0 2535; SSE42-NEXT: movdqa %xmm1, %xmm3 2536; SSE42-NEXT: pshufb %xmm0, %xmm3 2537; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 2538; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 2539; SSE42-NEXT: movdqa %xmm1, %xmm0 2540; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero 2541; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero 2542; SSE42-NEXT: paddb (%rdx), %xmm3 2543; SSE42-NEXT: paddb 16(%rdx), %xmm1 2544; SSE42-NEXT: paddb 32(%rdx), %xmm0 2545; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 2546; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 2547; SSE42-NEXT: movdqa %xmm3, (%rcx) 2548; SSE42-NEXT: retq 2549; 2550; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2551; AVX: # %bb.0: 2552; AVX-NEXT: vmovdqa (%rdi), %xmm0 2553; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2554; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2555; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2556; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 2557; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 2558; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 2559; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 2560; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero 2561; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero 2562; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2563; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 2564; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 2565; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 2566; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 2567; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2568; AVX-NEXT: retq 2569; 2570; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2571; AVX2: # %bb.0: 2572; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 2573; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2574; AVX2-NEXT: vmovdqa (%rdi), %xmm1 2575; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2576; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2 2577; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero 2578; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2579; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 2580; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 2581; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 2582; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 2583; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero 2584; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2585; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 2586; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 2587; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2588; AVX2-NEXT: vzeroupper 2589; AVX2-NEXT: retq 2590; 2591; AVX512F-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2592; AVX512F: # %bb.0: 2593; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2594; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2595; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2596; 
AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2597; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2598; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2599; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm2 2600; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2601; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2602; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero 2603; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2604; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2605; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2606; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2607; AVX512F-NEXT: vzeroupper 2608; AVX512F-NEXT: retq 2609; 2610; AVX512DQ-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2611; AVX512DQ: # %bb.0: 2612; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2613; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2614; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2615; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2616; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2617; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2618; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm2 2619; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2620; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2621; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero 2622; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2623; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2624; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2625; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2626; AVX512DQ-NEXT: vzeroupper 2627; AVX512DQ-NEXT: retq 2628; 2629; AVX512BW-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: 2630; AVX512BW: # %bb.0: 2631; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2632; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2633; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2634; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2635; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] 2636; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2 2637; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2638; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2639; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero 2640; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 2641; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2642; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2643; AVX512BW-NEXT: vzeroupper 2644; AVX512BW-NEXT: retq 2645 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2646 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2647 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2648 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 0, i32 52, i32 53, i32 0, i32 55, i32 56, i32 0, i32 58, i32 59, i32 0, i32 61, i32 62, i32 0, i32 64, i32 65, i32 0, i32 67, i32 68, i32 0, i32 70, i32 71, i32 0, i32 73, i32 74, i32 0, i32 76, i32 77, i32 0, i32 79, i32 80, i32 0, i32 82, i32 83, i32 0, i32 85, i32 86, i32 0, i32 88, i32 89, i32 0, i32 91, i32 92, i32 0, i32 94, i32 95> 2649 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> 
poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2650 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2651 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2652 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2653 ret void 2654} 2655 2656define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2657; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2658; SSE2: # %bb.0: 2659; SSE2-NEXT: movdqa (%rdi), %xmm0 2660; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2661; SSE2-NEXT: paddb (%rsi), %xmm0 2662; SSE2-NEXT: paddb 48(%rsi), %xmm1 2663; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2664; SSE2-NEXT: pand %xmm2, %xmm1 2665; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2666; SSE2-NEXT: pandn %xmm0, %xmm2 2667; SSE2-NEXT: por %xmm1, %xmm2 2668; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2669; SSE2-NEXT: paddb (%rdx), %xmm2 2670; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2671; SSE2-NEXT: paddb %xmm0, %xmm1 2672; SSE2-NEXT: paddb 32(%rdx), %xmm0 2673; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2674; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2675; SSE2-NEXT: movdqa %xmm2, (%rcx) 2676; SSE2-NEXT: retq 2677; 2678; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2679; SSE42: # %bb.0: 2680; SSE42-NEXT: movdqa (%rdi), %xmm1 2681; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2682; SSE42-NEXT: paddb 48(%rsi), %xmm2 2683; SSE42-NEXT: paddb (%rsi), %xmm1 2684; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] 2685; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2686; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 2687; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero 2688; SSE42-NEXT: paddb (%rdx), %xmm3 2689; SSE42-NEXT: movdqa 16(%rdx), %xmm0 2690; SSE42-NEXT: paddb %xmm1, %xmm0 2691; SSE42-NEXT: paddb 32(%rdx), %xmm1 2692; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2693; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2694; SSE42-NEXT: movdqa %xmm3, (%rcx) 2695; SSE42-NEXT: retq 2696; 2697; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2698; AVX: # %bb.0: 2699; AVX-NEXT: vmovdqa (%rdi), %xmm0 2700; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2701; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2702; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2703; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 2704; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2705; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 2706; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 2707; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2708; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 2709; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 2710; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 2711; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 
2712; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2713; AVX-NEXT: retq 2714; 2715; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2716; AVX2: # %bb.0: 2717; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2718; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2719; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 2720; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 2721; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 2722; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero 2723; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] 2724; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2725; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 2726; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 2727; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2728; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2729; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 2730; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 2731; AVX2-NEXT: vzeroupper 2732; AVX2-NEXT: retq 2733; 2734; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2735; AVX512F: # %bb.0: 2736; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2737; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2738; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2739; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2740; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 2741; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2742; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3 2743; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 2744; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) 2745; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 2746; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 2747; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2748; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2749; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2750; AVX512F-NEXT: vzeroupper 2751; AVX512F-NEXT: retq 2752; 2753; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2754; AVX512DQ: # %bb.0: 2755; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2756; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2757; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2758; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 2759; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 2760; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2761; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3 2762; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 2763; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) 2764; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 2765; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 2766; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2767; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2768; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2769; AVX512DQ-NEXT: vzeroupper 2770; AVX512DQ-NEXT: retq 2771; 2772; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: 2773; AVX512BW: # %bb.0: 2774; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2775; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2776; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 
2777; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 2778; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 2779; AVX512BW-NEXT: kmovd %eax, %k1 2780; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 2781; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2782; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 2783; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 2784; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2785; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2786; AVX512BW-NEXT: vzeroupper 2787; AVX512BW-NEXT: retq 2788 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2789 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2790 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2791 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95> 2792 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2793 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2794 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2795 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2796 ret void 2797} 2798 2799define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2800; SSE2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2801; SSE2: # %bb.0: 2802; SSE2-NEXT: movdqa (%rdi), %xmm0 2803; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2804; SSE2-NEXT: paddb (%rsi), %xmm0 2805; SSE2-NEXT: paddb 48(%rsi), %xmm1 2806; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] 2807; SSE2-NEXT: pand %xmm2, %xmm1 2808; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2809; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2810; SSE2-NEXT: pandn %xmm0, %xmm2 2811; SSE2-NEXT: por %xmm1, %xmm2 2812; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0] 2813; SSE2-NEXT: pand %xmm0, %xmm1 2814; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2815; SSE2-NEXT: paddb (%rdx), %xmm2 2816; SSE2-NEXT: paddb 16(%rdx), %xmm0 2817; SSE2-NEXT: paddb 32(%rdx), %xmm1 2818; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 2819; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 2820; SSE2-NEXT: movdqa %xmm2, (%rcx) 2821; SSE2-NEXT: retq 2822; 2823; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2824; SSE42: # %bb.0: 2825; SSE42-NEXT: movdqa (%rdi), %xmm1 2826; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2827; SSE42-NEXT: paddb 48(%rsi), %xmm2 
2828; SSE42-NEXT: paddb (%rsi), %xmm1 2829; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] 2830; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 2831; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] 2832; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 2833; SSE42-NEXT: movdqa %xmm1, %xmm0 2834; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero 2835; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero 2836; SSE42-NEXT: paddb (%rdx), %xmm3 2837; SSE42-NEXT: paddb 16(%rdx), %xmm1 2838; SSE42-NEXT: paddb 32(%rdx), %xmm0 2839; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 2840; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 2841; SSE42-NEXT: movdqa %xmm3, (%rcx) 2842; SSE42-NEXT: retq 2843; 2844; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2845; AVX: # %bb.0: 2846; AVX-NEXT: vmovdqa (%rdi), %xmm0 2847; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2848; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2849; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2850; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 2851; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 2852; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] 2853; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 2854; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero 2855; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero 2856; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 2857; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 2858; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 2859; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 2860; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 2861; AVX-NEXT: vmovdqa %xmm1, (%rcx) 2862; AVX-NEXT: retq 2863; 2864; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2865; AVX2: # %bb.0: 2866; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 2867; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 2868; AVX2-NEXT: vmovdqa (%rdi), %xmm1 2869; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 2870; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2 2871; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero 2872; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2873; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 2874; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] 2875; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 2876; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 2877; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero 2878; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 2879; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 2880; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 2881; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 2882; AVX2-NEXT: vzeroupper 2883; AVX2-NEXT: retq 2884; 2885; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2886; AVX512F: # %bb.0: 2887; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 2888; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 2889; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2890; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2891; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2892; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2893; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm2 2894; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2895; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2896; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero 2897; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2898; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2899; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 2900; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 2901; AVX512F-NEXT: vzeroupper 2902; AVX512F-NEXT: retq 2903; 2904; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2905; AVX512DQ: # %bb.0: 2906; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 2907; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 2908; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2909; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2910; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2911; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2912; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm2 2913; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2914; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2915; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero 2916; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 2917; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 2918; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 2919; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 2920; AVX512DQ-NEXT: vzeroupper 2921; AVX512DQ-NEXT: retq 2922; 2923; AVX512BW-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: 2924; AVX512BW: # %bb.0: 2925; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2926; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 2927; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 2928; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 2929; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] 2930; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2 2931; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2932; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2933; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero 2934; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 2935; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 2936; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 2937; AVX512BW-NEXT: vzeroupper 2938; AVX512BW-NEXT: retq 2939 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 2940 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 2941 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 2942 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 0, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 0, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 0, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 0, i32 91, i32 92, i32 93, i32 94, i32 95> 2943 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, 
i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2944 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 2945 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 2946 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 2947 ret void 2948} 2949 2950define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 2951; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2952; SSE2: # %bb.0: 2953; SSE2-NEXT: movdqa (%rdi), %xmm0 2954; SSE2-NEXT: movdqa 48(%rdi), %xmm1 2955; SSE2-NEXT: paddb (%rsi), %xmm0 2956; SSE2-NEXT: paddb 48(%rsi), %xmm1 2957; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 2958; SSE2-NEXT: pand %xmm2, %xmm1 2959; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2960; SSE2-NEXT: pandn %xmm0, %xmm2 2961; SSE2-NEXT: por %xmm1, %xmm2 2962; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2963; SSE2-NEXT: paddb (%rdx), %xmm2 2964; SSE2-NEXT: movdqa 16(%rdx), %xmm1 2965; SSE2-NEXT: paddb %xmm0, %xmm1 2966; SSE2-NEXT: paddb 32(%rdx), %xmm0 2967; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 2968; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 2969; SSE2-NEXT: movdqa %xmm2, (%rcx) 2970; SSE2-NEXT: retq 2971; 2972; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2973; SSE42: # %bb.0: 2974; SSE42-NEXT: movdqa (%rdi), %xmm1 2975; SSE42-NEXT: movdqa 48(%rdi), %xmm2 2976; SSE42-NEXT: paddb 48(%rsi), %xmm2 2977; SSE42-NEXT: paddb (%rsi), %xmm1 2978; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] 2979; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 2980; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 2981; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero 2982; SSE42-NEXT: paddb (%rdx), %xmm3 2983; SSE42-NEXT: movdqa 16(%rdx), %xmm0 2984; SSE42-NEXT: paddb %xmm1, %xmm0 2985; SSE42-NEXT: paddb 32(%rdx), %xmm1 2986; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 2987; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 2988; SSE42-NEXT: movdqa %xmm3, (%rcx) 2989; SSE42-NEXT: retq 2990; 2991; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 2992; AVX: # %bb.0: 2993; AVX-NEXT: vmovdqa (%rdi), %xmm0 2994; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 2995; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 2996; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 2997; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 2998; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551360] 2999; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 3000; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3001; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3002; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 3003; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3004; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3005; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3006; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3007; AVX-NEXT: retq 3008; 3009; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 3010; AVX2: # %bb.0: 3011; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3012; AVX2-NEXT: vmovdqa 32(%rdi), 
%ymm1 3013; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 3014; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3015; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 3016; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero 3017; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] 3018; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 3019; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 3020; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3021; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3022; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3023; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3024; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3025; AVX2-NEXT: vzeroupper 3026; AVX2-NEXT: retq 3027; 3028; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 3029; AVX512F: # %bb.0: 3030; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3031; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3032; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3033; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 3034; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 3035; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3036; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 3037; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 3038; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) 3039; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3040; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 3041; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3042; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3043; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3044; AVX512F-NEXT: vzeroupper 3045; AVX512F-NEXT: retq 3046; 3047; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 3048; AVX512DQ: # %bb.0: 3049; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3050; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3051; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3052; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 3053; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 3054; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3055; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 3056; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 3057; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) 3058; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3059; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 3060; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3061; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3062; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3063; AVX512DQ-NEXT: vzeroupper 3064; AVX512DQ-NEXT: retq 3065; 3066; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: 3067; AVX512BW: # %bb.0: 3068; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3069; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3070; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3071; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 3072; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 3073; AVX512BW-NEXT: kmovd %eax, %k1 3074; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 3075; AVX512BW-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3076; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3077; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3078; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3079; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3080; AVX512BW-NEXT: vzeroupper 3081; AVX512BW-NEXT: retq 3082 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3083 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3084 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3085 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3086 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3087 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3088 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3089 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3090 ret void 3091} 3092 3093define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3094; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3095; SSE2: # %bb.0: 3096; SSE2-NEXT: movdqa (%rdi), %xmm0 3097; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3098; SSE2-NEXT: paddb (%rsi), %xmm0 3099; SSE2-NEXT: paddb 48(%rsi), %xmm1 3100; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 3101; SSE2-NEXT: pand %xmm2, %xmm1 3102; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 3103; SSE2-NEXT: pandn %xmm3, %xmm2 3104; SSE2-NEXT: por %xmm1, %xmm2 3105; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 3106; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3107; SSE2-NEXT: movdqa %xmm0, %xmm1 3108; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 3109; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 3110; SSE2-NEXT: paddb (%rdx), %xmm2 3111; SSE2-NEXT: paddb 16(%rdx), %xmm0 3112; SSE2-NEXT: paddb 32(%rdx), %xmm1 3113; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3114; SSE2-NEXT: movdqa %xmm2, (%rcx) 3115; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3116; SSE2-NEXT: retq 3117; 3118; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3119; SSE42: # %bb.0: 3120; SSE42-NEXT: movdqa (%rdi), %xmm1 3121; SSE42-NEXT: movdqa 48(%rdi), %xmm2 3122; SSE42-NEXT: 
paddb 48(%rsi), %xmm2 3123; SSE42-NEXT: paddb (%rsi), %xmm1 3124; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] 3125; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 3126; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 3127; SSE42-NEXT: movdqa %xmm1, %xmm0 3128; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3129; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero 3130; SSE42-NEXT: paddb (%rdx), %xmm3 3131; SSE42-NEXT: paddb 16(%rdx), %xmm1 3132; SSE42-NEXT: paddb 32(%rdx), %xmm0 3133; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 3134; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 3135; SSE42-NEXT: movdqa %xmm3, (%rcx) 3136; SSE42-NEXT: retq 3137; 3138; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3139; AVX: # %bb.0: 3140; AVX-NEXT: vmovdqa (%rdi), %xmm0 3141; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3142; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3143; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3144; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 3145; AVX-NEXT: vpmovsxwd {{.*#+}} xmm3 = [4294967040,4294967295,4294967295,4294967040] 3146; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 3147; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3148; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3149; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3150; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 3151; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 3152; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 3153; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 3154; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3155; AVX-NEXT: retq 3156; 3157; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3158; AVX2: # %bb.0: 3159; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 3160; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 3161; AVX2-NEXT: vmovdqa (%rdi), %xmm1 3162; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 3163; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2 3164; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero 3165; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3166; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 3167; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 3168; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 3169; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 3170; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3171; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3172; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 3173; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 3174; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3175; AVX2-NEXT: vzeroupper 3176; AVX2-NEXT: retq 3177; 3178; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3179; AVX512F: # %bb.0: 3180; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3181; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3182; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3183; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3184; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3185; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3186; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm2 3187; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 
3188; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3189; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3190; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3191; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3192; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3193; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3194; AVX512F-NEXT: vzeroupper 3195; AVX512F-NEXT: retq 3196; 3197; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3198; AVX512DQ: # %bb.0: 3199; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3200; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3201; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3202; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3203; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3204; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3205; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm2 3206; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3207; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3208; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3209; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3210; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3211; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3212; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3213; AVX512DQ-NEXT: vzeroupper 3214; AVX512DQ-NEXT: retq 3215; 3216; AVX512BW-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: 3217; AVX512BW: # %bb.0: 3218; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3219; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3220; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3221; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 3222; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] 3223; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2 3224; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3225; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3226; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3227; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3228; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3229; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3230; AVX512BW-NEXT: vzeroupper 3231; AVX512BW-NEXT: retq 3232 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3233 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3234 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3235 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3236 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, 
i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3237 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3238 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3239 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3240 ret void 3241} 3242 3243define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3244; SSE2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3245; SSE2: # %bb.0: 3246; SSE2-NEXT: movdqa (%rdi), %xmm0 3247; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3248; SSE2-NEXT: paddb 48(%rsi), %xmm1 3249; SSE2-NEXT: paddb (%rsi), %xmm0 3250; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3251; SSE2-NEXT: pand %xmm2, %xmm1 3252; SSE2-NEXT: pandn %xmm0, %xmm2 3253; SSE2-NEXT: por %xmm1, %xmm2 3254; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3255; SSE2-NEXT: paddb (%rdx), %xmm2 3256; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3257; SSE2-NEXT: paddb %xmm0, %xmm1 3258; SSE2-NEXT: paddb 32(%rdx), %xmm0 3259; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3260; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3261; SSE2-NEXT: movdqa %xmm2, (%rcx) 3262; SSE2-NEXT: retq 3263; 3264; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3265; SSE42: # %bb.0: 3266; SSE42-NEXT: movdqa (%rdi), %xmm1 3267; SSE42-NEXT: movdqa 48(%rdi), %xmm2 3268; SSE42-NEXT: paddb 48(%rsi), %xmm2 3269; SSE42-NEXT: paddb (%rsi), %xmm1 3270; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3271; SSE42-NEXT: movdqa %xmm1, %xmm3 3272; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 3273; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3274; SSE42-NEXT: paddb (%rdx), %xmm3 3275; SSE42-NEXT: movdqa 16(%rdx), %xmm0 3276; SSE42-NEXT: paddb %xmm1, %xmm0 3277; SSE42-NEXT: paddb 32(%rdx), %xmm1 3278; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 3279; SSE42-NEXT: movdqa %xmm3, (%rcx) 3280; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3281; SSE42-NEXT: retq 3282; 3283; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3284; AVX: # %bb.0: 3285; AVX-NEXT: vmovdqa (%rdi), %xmm0 3286; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3287; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3288; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3289; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] 3290; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 3291; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3292; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3293; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 3294; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3295; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3296; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3297; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3298; AVX-NEXT: retq 3299; 3300; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3301; AVX2: # %bb.0: 3302; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3303; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 3304; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 3305; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3306; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 3307; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm3 = [255,0,255,0] 3308; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 3309; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] 3310; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm4 = 
[18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] 3311; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 3312; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 3313; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3314; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3315; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3316; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3317; AVX2-NEXT: vzeroupper 3318; AVX2-NEXT: retq 3319; 3320; AVX512F-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3321; AVX512F: # %bb.0: 3322; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 3323; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3324; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3325; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3326; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3327; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 3328; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 3329; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] 3330; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 3331; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) 3332; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3333; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 3334; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3335; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3336; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3337; AVX512F-NEXT: vzeroupper 3338; AVX512F-NEXT: retq 3339; 3340; AVX512DQ-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3341; AVX512DQ: # %bb.0: 3342; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 3343; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 3344; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3345; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3346; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3347; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] 3348; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 3349; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] 3350; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 3351; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) 3352; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3353; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 3354; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3355; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3356; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3357; AVX512DQ-NEXT: vzeroupper 3358; AVX512DQ-NEXT: retq 3359; 3360; AVX512BW-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: 3361; AVX512BW: # %bb.0: 3362; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3363; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3364; AVX512BW-NEXT: movw $1, %ax 3365; AVX512BW-NEXT: kmovd %eax, %k1 3366; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} {z} 3367; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 3368; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3369; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001 3370; AVX512BW-NEXT: kmovd %eax, %k1 3371; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} 3372; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 3373; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 3374; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3375; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3376; AVX512BW-NEXT: vzeroupper 3377; AVX512BW-NEXT: retq 3378 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3379 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3380 %in.vec = add <64 x i8> %in.vec.base, 
%in.vec.bias 3381 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3382 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3383 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3384 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3385 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3386 ret void 3387} 3388 3389define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3390; SSE2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3391; SSE2: # %bb.0: 3392; SSE2-NEXT: movdqa (%rdi), %xmm0 3393; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3394; SSE2-NEXT: paddb 48(%rsi), %xmm1 3395; SSE2-NEXT: paddb (%rsi), %xmm0 3396; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3397; SSE2-NEXT: pand %xmm2, %xmm1 3398; SSE2-NEXT: pandn %xmm0, %xmm2 3399; SSE2-NEXT: por %xmm1, %xmm2 3400; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 3401; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3402; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 3403; SSE2-NEXT: movaps 32(%rdx), %xmm1 3404; SSE2-NEXT: paddb (%rdx), %xmm2 3405; SSE2-NEXT: paddb 16(%rdx), %xmm0 3406; SSE2-NEXT: movaps %xmm1, 32(%rcx) 3407; SSE2-NEXT: movdqa %xmm2, (%rcx) 3408; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3409; SSE2-NEXT: retq 3410; 3411; SSE42-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3412; SSE42: # %bb.0: 3413; SSE42-NEXT: movdqa (%rdi), %xmm1 3414; SSE42-NEXT: movdqa 48(%rdi), %xmm2 3415; SSE42-NEXT: paddb 48(%rsi), %xmm2 3416; SSE42-NEXT: paddb (%rsi), %xmm1 3417; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3418; SSE42-NEXT: movdqa %xmm1, %xmm3 3419; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 3420; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero 3421; SSE42-NEXT: movaps 32(%rdx), %xmm0 3422; SSE42-NEXT: paddb (%rdx), %xmm3 3423; SSE42-NEXT: paddb 16(%rdx), %xmm1 3424; SSE42-NEXT: movaps %xmm0, 32(%rcx) 3425; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 3426; SSE42-NEXT: movdqa %xmm3, (%rcx) 3427; SSE42-NEXT: retq 3428; 3429; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3430; AVX: # 
%bb.0: 3431; AVX-NEXT: vmovdqa (%rdi), %xmm0 3432; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3433; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3434; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3435; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] 3436; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 3437; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 3438; AVX-NEXT: vmovaps 32(%rdx), %ymm2 3439; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3440; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3441; AVX-NEXT: vmovaps %ymm2, 32(%rcx) 3442; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3443; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3444; AVX-NEXT: vzeroupper 3445; AVX-NEXT: retq 3446; 3447; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3448; AVX2: # %bb.0: 3449; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 3450; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 3451; AVX2-NEXT: vmovdqa (%rdi), %xmm1 3452; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 3453; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 3454; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [255,0,18446744073709551615,18446744073709551360] 3455; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 3456; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 3457; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] 3458; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 3459; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 3460; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3461; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 3462; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 3463; AVX2-NEXT: vzeroupper 3464; AVX2-NEXT: retq 3465; 3466; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3467; AVX512F: # %bb.0: 3468; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3469; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3470; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3471; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3472; AVX512F-NEXT: vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0)) 3473; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 3474; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3475; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3476; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3477; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 3478; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 3479; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 3480; AVX512F-NEXT: vzeroupper 3481; AVX512F-NEXT: retq 3482; 3483; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3484; AVX512DQ: # %bb.0: 3485; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3486; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3487; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3488; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3489; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0)) 3490; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0 3491; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3492; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3493; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 3494; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 3495; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx) 3496; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 3497; AVX512DQ-NEXT: vzeroupper 3498; AVX512DQ-NEXT: retq 3499; 3500; AVX512BW-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: 3501; AVX512BW: # %bb.0: 3502; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3503; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3504; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 3505; AVX512BW-NEXT: movw $1, %ax 3506; 
AVX512BW-NEXT: kmovd %eax, %k1 3507; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 3508; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0 3509; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 3510; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3511; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3512; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 3513; AVX512BW-NEXT: vzeroupper 3514; AVX512BW-NEXT: retq 3515 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3516 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3517 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3518 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 3519 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3520 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3521 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3522 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3523 ret void 3524} 3525 3526define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3527; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3528; SSE2: # %bb.0: 3529; SSE2-NEXT: movdqa (%rdi), %xmm0 3530; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3531; SSE2-NEXT: paddb 48(%rsi), %xmm1 3532; SSE2-NEXT: paddb (%rsi), %xmm0 3533; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3534; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 3535; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 3536; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3537; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 3538; SSE2-NEXT: movdqa %xmm0, %xmm2 3539; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3540; SSE2-NEXT: pxor %xmm1, %xmm1 3541; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3542; SSE2-NEXT: paddb (%rdx), %xmm2 3543; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3544; SSE2-NEXT: paddb %xmm0, %xmm1 3545; SSE2-NEXT: paddb 32(%rdx), %xmm0 3546; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3547; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3548; SSE2-NEXT: movdqa %xmm2, (%rcx) 3549; SSE2-NEXT: retq 3550; 3551; SSE42-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3552; SSE42: # %bb.0: 3553; SSE42-NEXT: movdqa (%rdi), %xmm0 3554; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3555; SSE42-NEXT: paddb (%rsi), %xmm0 3556; SSE42-NEXT: paddb 48(%rsi), %xmm1 3557; SSE42-NEXT: 
pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3558; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3559; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3560; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3561; SSE42-NEXT: paddb (%rdx), %xmm0 3562; SSE42-NEXT: movdqa 16(%rdx), %xmm1 3563; SSE42-NEXT: paddb %xmm2, %xmm1 3564; SSE42-NEXT: paddb 32(%rdx), %xmm2 3565; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 3566; SSE42-NEXT: movdqa %xmm0, (%rcx) 3567; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 3568; SSE42-NEXT: retq 3569; 3570; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3571; AVX: # %bb.0: 3572; AVX-NEXT: vmovdqa (%rdi), %xmm0 3573; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3574; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3575; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3576; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3577; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] 3578; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3579; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3580; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 3581; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3582; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 3583; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 3584; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3585; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3586; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3587; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3588; AVX-NEXT: retq 3589; 3590; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3591; AVX2: # %bb.0: 3592; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3593; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3594; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3595; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3596; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 3597; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3598; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3599; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3600; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3601; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3602; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3603; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3604; AVX2-NEXT: vzeroupper 3605; AVX2-NEXT: retq 3606; 3607; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3608; AVX512F: # %bb.0: 3609; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 3610; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 3611; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3612; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3613; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 3614; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3615; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3616; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3617; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3618; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3619; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 3620; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 3621; AVX512F-NEXT: vzeroupper 3622; AVX512F-NEXT: retq 3623; 3624; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 
3625; AVX512DQ: # %bb.0: 3626; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 3627; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 3628; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3629; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3630; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 3631; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3632; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3633; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3634; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3635; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3636; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 3637; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 3638; AVX512DQ-NEXT: vzeroupper 3639; AVX512DQ-NEXT: retq 3640; 3641; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3642; AVX512BW-SLOW: # %bb.0: 3643; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 3644; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] 3645; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3646; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3647; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 3648; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 3649; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3650; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3651; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3652; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 3653; AVX512BW-SLOW-NEXT: vzeroupper 3654; AVX512BW-SLOW-NEXT: retq 3655; 3656; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: 3657; AVX512BW-FAST: # %bb.0: 3658; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 3659; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] 3660; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3661; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 3662; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 3663; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero 3664; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3665; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3666; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 3667; AVX512BW-FAST-NEXT: vzeroupper 3668; AVX512BW-FAST-NEXT: retq 3669 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3670 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3671 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3672 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 3673 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31, i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47> 3674 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8> 3675 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 
47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3676 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3677 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3678 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3679 ret void 3680} 3681 3682define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3683; SSE2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3684; SSE2: # %bb.0: 3685; SSE2-NEXT: movdqa (%rdi), %xmm0 3686; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3687; SSE2-NEXT: paddb (%rsi), %xmm0 3688; SSE2-NEXT: paddb 48(%rsi), %xmm1 3689; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] 3690; SSE2-NEXT: pand %xmm2, %xmm1 3691; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3692; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3693; SSE2-NEXT: pandn %xmm0, %xmm2 3694; SSE2-NEXT: por %xmm1, %xmm2 3695; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0] 3696; SSE2-NEXT: pand %xmm0, %xmm1 3697; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3698; SSE2-NEXT: paddb (%rdx), %xmm2 3699; SSE2-NEXT: paddb 16(%rdx), %xmm0 3700; SSE2-NEXT: paddb 32(%rdx), %xmm1 3701; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 3702; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 3703; SSE2-NEXT: movdqa %xmm2, (%rcx) 3704; SSE2-NEXT: retq 3705; 3706; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3707; SSE42: # %bb.0: 3708; SSE42-NEXT: movdqa (%rdi), %xmm0 3709; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3710; SSE42-NEXT: paddb 48(%rsi), %xmm1 3711; SSE42-NEXT: paddb (%rsi), %xmm0 3712; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3713; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3714; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3715; SSE42-NEXT: pxor %xmm2, %xmm2 3716; SSE42-NEXT: pxor %xmm3, %xmm3 3717; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] 3718; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] 3719; SSE42-NEXT: paddb (%rdx), %xmm1 3720; SSE42-NEXT: paddb 16(%rdx), %xmm2 3721; SSE42-NEXT: paddb 32(%rdx), %xmm3 3722; SSE42-NEXT: movdqa %xmm3, 32(%rcx) 3723; SSE42-NEXT: movdqa %xmm1, (%rcx) 3724; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 3725; SSE42-NEXT: retq 3726; 3727; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3728; AVX: # %bb.0: 3729; AVX-NEXT: vmovdqa (%rdi), %xmm0 3730; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3731; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3732; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3733; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 3734; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3735; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3736; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 3737; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3738; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3739; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 3740; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 3741; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3742; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3743; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3744; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3745; AVX-NEXT: retq 3746; 3747; AVX2-LABEL: 
vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3748; AVX2: # %bb.0: 3749; AVX2-NEXT: vmovdqa (%rdi), %xmm0 3750; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 3751; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3752; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3753; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 3754; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3755; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 3756; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3757; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3758; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3759; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3760; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3761; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 3762; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 3763; AVX2-NEXT: vzeroupper 3764; AVX2-NEXT: retq 3765; 3766; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3767; AVX512F-SLOW: # %bb.0: 3768; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 3769; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 3770; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3771; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3772; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 3773; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3774; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3775; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] 3776; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3777; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3778; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3779; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3780; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3781; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 3782; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 3783; AVX512F-SLOW-NEXT: vzeroupper 3784; AVX512F-SLOW-NEXT: retq 3785; 3786; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3787; AVX512F-FAST: # %bb.0: 3788; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 3789; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 3790; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3791; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3792; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 3793; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 3794; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 3795; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] 3796; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3797; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero 3798; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3799; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3800; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 3801; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) 3802; AVX512F-FAST-NEXT: vzeroupper 3803; AVX512F-FAST-NEXT: retq 3804; 3805; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3806; AVX512DQ-SLOW: # %bb.0: 3807; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 3808; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 3809; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3810; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3811; 
AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 3812; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 3813; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3814; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] 3815; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3816; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3817; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3818; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3819; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3820; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 3821; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 3822; AVX512DQ-SLOW-NEXT: vzeroupper 3823; AVX512DQ-SLOW-NEXT: retq 3824; 3825; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3826; AVX512DQ-FAST: # %bb.0: 3827; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 3828; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 3829; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3830; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3831; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 3832; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 3833; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 3834; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] 3835; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3836; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero 3837; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3838; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3839; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 3840; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) 3841; AVX512DQ-FAST-NEXT: vzeroupper 3842; AVX512DQ-FAST-NEXT: retq 3843; 3844; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3845; AVX512BW-SLOW: # %bb.0: 3846; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 3847; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] 3848; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3849; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3850; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 3851; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 3852; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3853; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] 3854; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3855; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3856; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 3857; AVX512BW-SLOW-NEXT: vzeroupper 3858; AVX512BW-SLOW-NEXT: retq 3859; 3860; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: 3861; AVX512BW-FAST: # %bb.0: 3862; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 3863; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] 3864; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 3865; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 3866; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 3867; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero 3868; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 3869; 
AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 3870; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 3871; AVX512BW-FAST-NEXT: vzeroupper 3872; AVX512BW-FAST-NEXT: retq 3873 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 3874 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 3875 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 3876 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 3877 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 0, i32 28, i32 29, i32 0, i32 31, i32 32, i32 0, i32 34, i32 35, i32 0, i32 37, i32 38, i32 0, i32 40, i32 41, i32 0, i32 43, i32 44, i32 0, i32 46, i32 47> 3878 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8> 3879 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3880 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 3881 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 3882 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 3883 ret void 3884} 3885 3886define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 3887; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3888; SSE2: # %bb.0: 3889; SSE2-NEXT: movdqa (%rdi), %xmm0 3890; SSE2-NEXT: movdqa 48(%rdi), %xmm1 3891; SSE2-NEXT: paddb (%rsi), %xmm0 3892; SSE2-NEXT: paddb 48(%rsi), %xmm1 3893; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535] 3894; SSE2-NEXT: pand %xmm2, %xmm1 3895; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3896; SSE2-NEXT: pandn %xmm0, %xmm2 3897; SSE2-NEXT: por %xmm1, %xmm2 3898; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3899; SSE2-NEXT: paddb (%rdx), %xmm2 3900; SSE2-NEXT: movdqa 16(%rdx), %xmm1 3901; SSE2-NEXT: paddb %xmm0, %xmm1 3902; SSE2-NEXT: paddb 32(%rdx), %xmm0 3903; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 3904; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 3905; SSE2-NEXT: movdqa %xmm2, (%rcx) 3906; SSE2-NEXT: retq 3907; 3908; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3909; SSE42: # %bb.0: 3910; SSE42-NEXT: movdqa (%rdi), %xmm0 3911; SSE42-NEXT: movdqa 48(%rdi), %xmm1 3912; SSE42-NEXT: paddb 48(%rsi), %xmm1 3913; SSE42-NEXT: paddb (%rsi), %xmm0 3914; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3915; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 3916; SSE42-NEXT: pxor %xmm2, %xmm2 3917; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 3918; SSE42-NEXT: paddb (%rdx), %xmm1 3919; SSE42-NEXT: movdqa 16(%rdx), %xmm0 3920; SSE42-NEXT: paddb %xmm2, %xmm0 3921; SSE42-NEXT: paddb 32(%rdx), %xmm2 3922; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 3923; SSE42-NEXT: movdqa %xmm1, (%rcx) 3924; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 3925; SSE42-NEXT: retq 3926; 3927; AVX-LABEL: 
vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3928; AVX: # %bb.0: 3929; AVX-NEXT: vmovdqa (%rdi), %xmm0 3930; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 3931; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3932; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3933; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3934; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 3935; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 3936; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 3937; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 3938; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 3939; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 3940; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 3941; AVX-NEXT: vmovdqa %xmm1, (%rcx) 3942; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 3943; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 3944; AVX-NEXT: retq 3945; 3946; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3947; AVX2-SLOW: # %bb.0: 3948; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 3949; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 3950; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3951; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3952; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 3953; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 3954; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3955; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 3956; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 3957; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 3958; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3959; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3960; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 3961; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 3962; AVX2-SLOW-NEXT: vzeroupper 3963; AVX2-SLOW-NEXT: retq 3964; 3965; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3966; AVX2-FAST-PERLANE: # %bb.0: 3967; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 3968; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 3969; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3970; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3971; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 3972; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 3973; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3974; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero 3975; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3976; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3977; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 3978; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 3979; AVX2-FAST-PERLANE-NEXT: vzeroupper 3980; AVX2-FAST-PERLANE-NEXT: retq 3981; 3982; AVX2-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 3983; AVX2-FAST: # %bb.0: 3984; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 3985; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 3986; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 3987; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 3988; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 3989; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 3990; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3991; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero 3992; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 3993; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 3994; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 3995; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 3996; AVX2-FAST-NEXT: vzeroupper 3997; AVX2-FAST-NEXT: retq 3998; 3999; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 4000; AVX512F-SLOW: # %bb.0: 4001; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4002; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 4003; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4004; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4005; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 4006; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 4007; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4008; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 4009; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4010; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 4011; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4012; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4013; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4014; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4015; AVX512F-SLOW-NEXT: vzeroupper 4016; AVX512F-SLOW-NEXT: retq 4017; 4018; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 4019; AVX512F-FAST: # %bb.0: 4020; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 4021; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 4022; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4023; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4024; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 4025; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 4026; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4027; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero 4028; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4029; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4030; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4031; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4032; AVX512F-FAST-NEXT: vzeroupper 4033; AVX512F-FAST-NEXT: retq 4034; 4035; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 4036; AVX512DQ-SLOW: # %bb.0: 4037; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4038; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 4039; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4040; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4041; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 4042; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 4043; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4044; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 4045; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4046; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 4047; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4048; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4049; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4050; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4051; AVX512DQ-SLOW-NEXT: vzeroupper 4052; AVX512DQ-SLOW-NEXT: retq 4053; 4054; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 4055; AVX512DQ-FAST: # 
%bb.0: 4056; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 4057; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 4058; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4059; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4060; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 4061; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] 4062; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4063; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero 4064; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4065; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4066; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4067; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4068; AVX512DQ-FAST-NEXT: vzeroupper 4069; AVX512DQ-FAST-NEXT: retq 4070; 4071; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 4072; AVX512BW-SLOW: # %bb.0: 4073; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 4074; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] 4075; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4076; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4077; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 4078; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 4079; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4080; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 4081; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4082; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4083; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4084; AVX512BW-SLOW-NEXT: vzeroupper 4085; AVX512BW-SLOW-NEXT: retq 4086; 4087; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: 4088; AVX512BW-FAST: # %bb.0: 4089; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 4090; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] 4091; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4092; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4093; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 4094; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero 4095; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4096; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4097; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4098; AVX512BW-FAST-NEXT: vzeroupper 4099; AVX512BW-FAST-NEXT: retq 4100 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4101 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4102 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4103 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 4104 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47> 4105 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8> 4106 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 
37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4107 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4108 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4109 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4110 ret void 4111} 4112 4113define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4114; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4115; SSE2: # %bb.0: 4116; SSE2-NEXT: movdqa (%rdi), %xmm0 4117; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4118; SSE2-NEXT: paddb (%rsi), %xmm0 4119; SSE2-NEXT: paddb 48(%rsi), %xmm1 4120; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] 4121; SSE2-NEXT: pand %xmm2, %xmm1 4122; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] 4123; SSE2-NEXT: pandn %xmm3, %xmm2 4124; SSE2-NEXT: por %xmm1, %xmm2 4125; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 4126; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4127; SSE2-NEXT: movdqa %xmm0, %xmm1 4128; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 4129; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 4130; SSE2-NEXT: paddb (%rdx), %xmm2 4131; SSE2-NEXT: paddb 16(%rdx), %xmm0 4132; SSE2-NEXT: paddb 32(%rdx), %xmm1 4133; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 4134; SSE2-NEXT: movdqa %xmm2, (%rcx) 4135; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4136; SSE2-NEXT: retq 4137; 4138; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4139; SSE42: # %bb.0: 4140; SSE42-NEXT: movdqa (%rdi), %xmm0 4141; SSE42-NEXT: movdqa 48(%rdi), %xmm1 4142; SSE42-NEXT: paddb 48(%rsi), %xmm1 4143; SSE42-NEXT: paddb (%rsi), %xmm0 4144; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 4145; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] 4146; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] 4147; SSE42-NEXT: pxor %xmm3, %xmm3 4148; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] 4149; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4150; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] 4151; SSE42-NEXT: paddb (%rdx), %xmm2 4152; SSE42-NEXT: paddb 16(%rdx), %xmm0 4153; SSE42-NEXT: paddb 32(%rdx), %xmm1 4154; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 4155; SSE42-NEXT: movdqa %xmm2, (%rcx) 4156; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4157; SSE42-NEXT: retq 4158; 4159; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4160; AVX: # %bb.0: 4161; AVX-NEXT: vmovdqa (%rdi), %xmm0 4162; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 4163; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4164; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4165; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 4166; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] 4167; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 4168; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 4169; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 4170; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] 4171; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4172; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 
4173; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2 4174; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4175; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4176; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 4177; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4178; AVX-NEXT: retq 4179; 4180; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4181; AVX2: # %bb.0: 4182; AVX2-NEXT: vmovdqa (%rdi), %xmm0 4183; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 4184; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4185; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4186; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 4187; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] 4188; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 4189; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4190; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4191; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] 4192; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4193; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4194; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 4195; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 4196; AVX2-NEXT: vzeroupper 4197; AVX2-NEXT: retq 4198; 4199; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4200; AVX512F-SLOW: # %bb.0: 4201; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4202; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 4203; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4204; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4205; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 4206; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] 4207; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4208; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] 4209; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 4210; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4211; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] 4212; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4213; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4214; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4215; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4216; AVX512F-SLOW-NEXT: vzeroupper 4217; AVX512F-SLOW-NEXT: retq 4218; 4219; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4220; AVX512F-FAST: # %bb.0: 4221; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 4222; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 4223; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4224; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4225; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 4226; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] 4227; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 4228; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] 4229; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 4230; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4231; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4232; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4233; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4234; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4235; AVX512F-FAST-NEXT: vzeroupper 4236; AVX512F-FAST-NEXT: retq 4237; 4238; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4239; AVX512DQ-SLOW: # %bb.0: 4240; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4241; AVX512DQ-SLOW-NEXT: vmovdqa 
48(%rdi), %xmm1 4242; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4243; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4244; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 4245; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] 4246; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4247; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] 4248; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 4249; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4250; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] 4251; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4252; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4253; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4254; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4255; AVX512DQ-SLOW-NEXT: vzeroupper 4256; AVX512DQ-SLOW-NEXT: retq 4257; 4258; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4259; AVX512DQ-FAST: # %bb.0: 4260; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 4261; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 4262; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4263; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4264; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 4265; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] 4266; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 4267; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] 4268; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 4269; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4270; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4271; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4272; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4273; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4274; AVX512DQ-FAST-NEXT: vzeroupper 4275; AVX512DQ-FAST-NEXT: retq 4276; 4277; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4278; AVX512BW-SLOW: # %bb.0: 4279; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 4280; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] 4281; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4282; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4283; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 4284; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 4285; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4286; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] 4287; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4288; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4289; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4290; AVX512BW-SLOW-NEXT: vzeroupper 4291; AVX512BW-SLOW-NEXT: retq 4292; 4293; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 4294; AVX512BW-FAST: # %bb.0: 4295; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 4296; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] 4297; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4298; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4299; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 4300; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4301; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4302; 
AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4303; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4304; AVX512BW-FAST-NEXT: vzeroupper 4305; AVX512BW-FAST-NEXT: retq 4306 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4307 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4308 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4309 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 4310 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 0, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 0, i32 43, i32 44, i32 45, i32 46, i32 47> 4311 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8> 4312 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4313 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4314 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4315 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4316 ret void 4317} 4318 4319define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4320; SSE2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4321; SSE2: # %bb.0: 4322; SSE2-NEXT: movdqa (%rdi), %xmm0 4323; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4324; SSE2-NEXT: paddb 48(%rsi), %xmm1 4325; SSE2-NEXT: paddb (%rsi), %xmm0 4326; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 4327; SSE2-NEXT: pand %xmm2, %xmm1 4328; SSE2-NEXT: pandn %xmm0, %xmm2 4329; SSE2-NEXT: por %xmm1, %xmm2 4330; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 4331; SSE2-NEXT: paddb (%rdx), %xmm2 4332; SSE2-NEXT: movdqa 16(%rdx), %xmm1 4333; SSE2-NEXT: paddb %xmm0, %xmm1 4334; SSE2-NEXT: paddb 32(%rdx), %xmm0 4335; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 4336; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 4337; SSE2-NEXT: movdqa %xmm2, (%rcx) 4338; SSE2-NEXT: retq 4339; 4340; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4341; SSE42: # %bb.0: 4342; SSE42-NEXT: movdqa (%rdi), %xmm0 4343; SSE42-NEXT: movdqa 48(%rdi), %xmm1 4344; SSE42-NEXT: paddb (%rsi), %xmm0 4345; SSE42-NEXT: paddb 48(%rsi), %xmm1 4346; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4347; SSE42-NEXT: pxor %xmm2, %xmm2 4348; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4349; SSE42-NEXT: paddb (%rdx), %xmm1 4350; SSE42-NEXT: movdqa 16(%rdx), %xmm0 4351; SSE42-NEXT: paddb %xmm2, %xmm0 4352; SSE42-NEXT: paddb 32(%rdx), %xmm2 4353; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 4354; SSE42-NEXT: movdqa %xmm1, (%rcx) 4355; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4356; SSE42-NEXT: retq 4357; 4358; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4359; AVX: # %bb.0: 4360; AVX-NEXT: vmovdqa (%rdi), %xmm0 4361; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 
4362; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4363; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4364; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4365; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 4366; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4367; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4368; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 4369; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 4370; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 4371; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4372; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 4373; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 4374; AVX-NEXT: retq 4375; 4376; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4377; AVX2: # %bb.0: 4378; AVX2-NEXT: vmovdqa (%rdi), %ymm0 4379; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4380; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 4381; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4382; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 4383; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4384; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4385; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4386; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4387; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4388; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4389; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 4390; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 4391; AVX2-NEXT: vzeroupper 4392; AVX2-NEXT: retq 4393; 4394; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4395; AVX512F: # %bb.0: 4396; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4397; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4398; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 4399; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4400; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 4401; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4402; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4403; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 4404; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4405; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4406; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4407; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 4408; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 4409; AVX512F-NEXT: vzeroupper 4410; AVX512F-NEXT: retq 4411; 4412; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4413; AVX512DQ: # %bb.0: 4414; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 4415; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4416; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 4417; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4418; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 4419; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4420; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 4421; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 4422; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4423; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4424; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4425; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 4426; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 4427; AVX512DQ-NEXT: vzeroupper 4428; AVX512DQ-NEXT: retq 4429; 4430; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: 4431; AVX512BW: # %bb.0: 4432; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 4433; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] 4434; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, 
%zmm0 4435; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4436; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 4437; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4438; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 4439; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4440; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4441; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4442; AVX512BW-NEXT: vzeroupper 4443; AVX512BW-NEXT: retq 4444 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4445 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4446 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4447 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 4448 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4449 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8> 4450 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4451 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4452 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4453 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4454 ret void 4455} 4456 4457define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4458; SSE2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4459; SSE2: # %bb.0: 4460; SSE2-NEXT: movdqa (%rdi), %xmm0 4461; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4462; SSE2-NEXT: paddb 48(%rsi), %xmm1 4463; SSE2-NEXT: paddb (%rsi), %xmm0 4464; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 4465; SSE2-NEXT: pand %xmm2, %xmm1 4466; SSE2-NEXT: pandn %xmm0, %xmm2 4467; SSE2-NEXT: por %xmm1, %xmm2 4468; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] 4469; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4470; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 4471; SSE2-NEXT: movaps 32(%rdx), %xmm1 4472; SSE2-NEXT: paddb (%rdx), %xmm2 4473; SSE2-NEXT: paddb 16(%rdx), %xmm0 4474; SSE2-NEXT: movaps %xmm1, 32(%rcx) 4475; SSE2-NEXT: movdqa %xmm2, (%rcx) 4476; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 4477; SSE2-NEXT: retq 4478; 4479; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4480; SSE42: # %bb.0: 4481; SSE42-NEXT: movdqa (%rdi), %xmm0 4482; SSE42-NEXT: movdqa 48(%rdi), %xmm1 4483; SSE42-NEXT: paddb (%rsi), %xmm0 4484; SSE42-NEXT: paddb 48(%rsi), %xmm1 4485; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4486; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4487; SSE42-NEXT: 
pxor %xmm2, %xmm2 4488; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] 4489; SSE42-NEXT: movaps 32(%rdx), %xmm0 4490; SSE42-NEXT: paddb (%rdx), %xmm1 4491; SSE42-NEXT: paddb 16(%rdx), %xmm2 4492; SSE42-NEXT: movaps %xmm0, 32(%rcx) 4493; SSE42-NEXT: movdqa %xmm1, (%rcx) 4494; SSE42-NEXT: movdqa %xmm2, 16(%rcx) 4495; SSE42-NEXT: retq 4496; 4497; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4498; AVX: # %bb.0: 4499; AVX-NEXT: vmovdqa (%rdi), %xmm0 4500; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 4501; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4502; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4503; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4504; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4505; AVX-NEXT: vmovaps 32(%rdx), %ymm2 4506; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4507; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 4508; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 4509; AVX-NEXT: vmovaps %ymm2, 32(%rcx) 4510; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4511; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 4512; AVX-NEXT: vzeroupper 4513; AVX-NEXT: retq 4514; 4515; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4516; AVX2: # %bb.0: 4517; AVX2-NEXT: vmovdqa (%rdi), %xmm0 4518; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 4519; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4520; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4521; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4522; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 4523; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4524; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] 4525; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 4526; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 4527; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4528; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 4529; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 4530; AVX2-NEXT: vzeroupper 4531; AVX2-NEXT: retq 4532; 4533; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4534; AVX512F: # %bb.0: 4535; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 4536; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 4537; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4538; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4539; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4540; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 4541; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 4542; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] 4543; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 4544; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4545; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 4546; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 4547; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 4548; AVX512F-NEXT: vzeroupper 4549; AVX512F-NEXT: retq 4550; 4551; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4552; AVX512DQ: # %bb.0: 4553; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 4554; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 4555; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4556; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4557; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 4558; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 4559; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 4560; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] 4561; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 4562; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 4563; 
AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 4564; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx) 4565; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 4566; AVX512DQ-NEXT: vzeroupper 4567; AVX512DQ-NEXT: retq 4568; 4569; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: 4570; AVX512BW: # %bb.0: 4571; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 4572; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] 4573; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4574; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4575; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 4576; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 4577; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4578; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 4579; AVX512BW-NEXT: vzeroupper 4580; AVX512BW-NEXT: retq 4581 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4582 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4583 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4584 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 4585 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 4586 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8> 4587 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4588 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4589 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4590 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4591 ret void 4592} 4593 4594define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4595; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4596; SSE2: # %bb.0: 4597; SSE2-NEXT: movdqa (%rdi), %xmm0 4598; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4599; SSE2-NEXT: paddb (%rsi), %xmm0 4600; SSE2-NEXT: paddb 48(%rsi), %xmm1 4601; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 4602; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 4603; SSE2-NEXT: movdqa %xmm0, %xmm2 4604; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4605; SSE2-NEXT: pxor %xmm1, %xmm1 4606; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4607; SSE2-NEXT: paddb (%rdx), %xmm2 4608; SSE2-NEXT: movdqa 16(%rdx), %xmm1 4609; SSE2-NEXT: paddb %xmm0, %xmm1 4610; SSE2-NEXT: paddb 32(%rdx), %xmm0 4611; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 4612; SSE2-NEXT: movdqa %xmm2, (%rcx) 4613; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 4614; SSE2-NEXT: retq 4615; 4616; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4617; SSE42: # %bb.0: 4618; SSE42-NEXT: movdqa (%rdi), %xmm0 4619; SSE42-NEXT: movdqa 48(%rdi), %xmm1 4620; SSE42-NEXT: paddb 48(%rsi), %xmm1 4621; 
SSE42-NEXT: paddb (%rsi), %xmm0 4622; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4623; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 4624; SSE42-NEXT: pxor %xmm2, %xmm2 4625; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 4626; SSE42-NEXT: paddb (%rdx), %xmm1 4627; SSE42-NEXT: movdqa 16(%rdx), %xmm0 4628; SSE42-NEXT: paddb %xmm2, %xmm0 4629; SSE42-NEXT: paddb 32(%rdx), %xmm2 4630; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 4631; SSE42-NEXT: movdqa %xmm1, (%rcx) 4632; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 4633; SSE42-NEXT: retq 4634; 4635; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4636; AVX: # %bb.0: 4637; AVX-NEXT: vmovdqa (%rdi), %xmm0 4638; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 4639; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4640; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4641; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 4642; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] 4643; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] 4644; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4645; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 4646; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 4647; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 4648; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 4649; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 4650; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 4651; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4652; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4653; AVX-NEXT: vmovdqa %xmm2, (%rcx) 4654; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 4655; AVX-NEXT: vzeroupper 4656; AVX-NEXT: retq 4657; 4658; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4659; AVX2-SLOW: # %bb.0: 4660; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 4661; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 4662; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4663; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4664; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 4665; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] 4666; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4667; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] 4668; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 4669; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4670; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 4671; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4672; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4673; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4674; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4675; AVX2-SLOW-NEXT: vzeroupper 4676; AVX2-SLOW-NEXT: retq 4677; 4678; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4679; AVX2-FAST-PERLANE: # %bb.0: 4680; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 4681; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 4682; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4683; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4684; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 4685; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] 4686; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 4687; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] 4688; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 4689; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4690; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, 
%ymm0 4691; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 4692; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 4693; AVX2-FAST-PERLANE-NEXT: vzeroupper 4694; AVX2-FAST-PERLANE-NEXT: retq 4695; 4696; AVX2-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4697; AVX2-FAST: # %bb.0: 4698; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 4699; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 4700; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4701; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4702; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 4703; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] 4704; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4705; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] 4706; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 4707; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4708; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4709; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4710; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4711; AVX2-FAST-NEXT: vzeroupper 4712; AVX2-FAST-NEXT: retq 4713; 4714; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4715; AVX512F: # %bb.0: 4716; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4717; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 4718; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4719; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4720; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4721; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 4722; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] 4723; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 4724; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 4725; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4726; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 4727; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 4728; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 4729; AVX512F-NEXT: vzeroupper 4730; AVX512F-NEXT: retq 4731; 4732; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4733; AVX512DQ: # %bb.0: 4734; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 4735; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 4736; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4737; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4738; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4739; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 4740; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] 4741; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 4742; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 4743; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4744; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 4745; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 4746; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 4747; AVX512DQ-NEXT: vzeroupper 4748; AVX512DQ-NEXT: retq 4749; 4750; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4751; AVX512BW-SLOW: # %bb.0: 4752; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 4753; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4754; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] 4755; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 4756; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 4757; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4758; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 4759; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4760; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] 4761; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4762; 
AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4763; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4764; AVX512BW-SLOW-NEXT: vzeroupper 4765; AVX512BW-SLOW-NEXT: retq 4766; 4767; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: 4768; AVX512BW-FAST: # %bb.0: 4769; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 4770; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4771; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] 4772; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 4773; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4774; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] 4775; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 4776; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4777; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4778; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4779; AVX512BW-FAST-NEXT: vzeroupper 4780; AVX512BW-FAST-NEXT: retq 4781 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4782 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4783 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4784 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 4785 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 0, i32 15, i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23> 4786 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8> 4787 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4788 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4789 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4790 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4791 ret void 4792} 4793 4794define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4795; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4796; SSE2: # %bb.0: 4797; SSE2-NEXT: movdqa (%rdi), %xmm0 4798; SSE2-NEXT: movdqa 48(%rdi), %xmm1 4799; SSE2-NEXT: paddb 48(%rsi), %xmm1 4800; SSE2-NEXT: paddb (%rsi), %xmm0 4801; SSE2-NEXT: xorps %xmm2, %xmm2 4802; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 4803; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2] 4804; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 4805; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1] 4806; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] 4807; SSE2-NEXT: paddb (%rdx), %xmm0 4808; SSE2-NEXT: paddb 16(%rdx), %xmm2 4809; SSE2-NEXT: paddb 32(%rdx), %xmm1 4810; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 4811; SSE2-NEXT: movdqa %xmm0, (%rcx) 4812; SSE2-NEXT: movdqa %xmm2, 16(%rcx) 4813; SSE2-NEXT: retq 4814; 4815; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4816; SSE42: # %bb.0: 4817; SSE42-NEXT: movdqa (%rdi), %xmm0 4818; SSE42-NEXT: 
movdqa 48(%rdi), %xmm1 4819; SSE42-NEXT: paddb 48(%rsi), %xmm1 4820; SSE42-NEXT: paddb (%rsi), %xmm0 4821; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] 4822; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] 4823; SSE42-NEXT: pxor %xmm1, %xmm1 4824; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 4825; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 4826; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] 4827; SSE42-NEXT: paddb (%rdx), %xmm2 4828; SSE42-NEXT: paddb 16(%rdx), %xmm1 4829; SSE42-NEXT: paddb 32(%rdx), %xmm0 4830; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 4831; SSE42-NEXT: movdqa %xmm2, (%rcx) 4832; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 4833; SSE42-NEXT: retq 4834; 4835; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4836; AVX: # %bb.0: 4837; AVX-NEXT: vmovdqa (%rdi), %xmm0 4838; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 4839; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 4840; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 4841; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 4842; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4843; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] 4844; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 4845; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] 4846; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] 4847; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 4848; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 4849; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] 4850; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 4851; AVX-NEXT: vmovdqa %xmm1, (%rcx) 4852; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) 4853; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) 4854; AVX-NEXT: retq 4855; 4856; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4857; AVX2-SLOW: # %bb.0: 4858; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 4859; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 4860; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4861; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4862; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4863; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] 4864; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 4865; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4866; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] 4867; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4868; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 4869; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] 4870; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4871; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4872; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) 4873; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) 4874; AVX2-SLOW-NEXT: vzeroupper 4875; AVX2-SLOW-NEXT: retq 4876; 4877; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4878; AVX2-FAST-PERLANE: # %bb.0: 4879; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 4880; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 4881; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4882; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4883; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4884; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] 4885; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 4886; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 4887; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] 4888; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 4889; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4890; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4891; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) 4892; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) 4893; AVX2-FAST-PERLANE-NEXT: vzeroupper 4894; AVX2-FAST-PERLANE-NEXT: retq 4895; 4896; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4897; AVX2-FAST: # %bb.0: 4898; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 4899; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 4900; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4901; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4902; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4903; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] 4904; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 4905; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4906; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] 4907; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 4908; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 4909; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4910; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) 4911; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) 4912; AVX2-FAST-NEXT: vzeroupper 4913; AVX2-FAST-NEXT: retq 4914; 4915; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4916; AVX512F: # %bb.0: 4917; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 4918; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 4919; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4920; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4921; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4922; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 4923; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] 4924; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 4925; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 4926; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4927; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 4928; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 4929; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 4930; AVX512F-NEXT: vzeroupper 4931; AVX512F-NEXT: retq 4932; 4933; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4934; AVX512DQ: # %bb.0: 4935; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 4936; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 4937; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 4938; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 4939; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4940; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 4941; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] 4942; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 4943; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 4944; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 4945; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 4946; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 4947; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 4948; AVX512DQ-NEXT: vzeroupper 4949; AVX512DQ-NEXT: retq 4950; 4951; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4952; AVX512BW-SLOW: # %bb.0: 4953; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 4954; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4955; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] 4956; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 4957; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 4958; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4959; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm2[0],xmm0[1],xmm2[2,3] 4960; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 4961; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] 4962; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4963; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4964; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 4965; AVX512BW-SLOW-NEXT: vzeroupper 4966; AVX512BW-SLOW-NEXT: retq 4967; 4968; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: 4969; AVX512BW-FAST: # %bb.0: 4970; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 4971; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 4972; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] 4973; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 4974; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 4975; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] 4976; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 4977; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 4978; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 4979; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 4980; AVX512BW-FAST-NEXT: vzeroupper 4981; AVX512BW-FAST-NEXT: retq 4982 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 4983 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 4984 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 4985 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 4986 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 0, i32 16, i32 17, i32 0, i32 19, i32 20, i32 0, i32 22, i32 23> 4987 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8> 4988 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4989 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 4990 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 4991 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 4992 ret void 4993} 4994 4995define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 4996; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 4997; SSE2: # %bb.0: 4998; SSE2-NEXT: movdqa (%rdi), %xmm0 4999; SSE2-NEXT: movdqa 48(%rdi), %xmm1 5000; SSE2-NEXT: paddb (%rsi), %xmm0 5001; SSE2-NEXT: paddb 48(%rsi), %xmm1 5002; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 5003; SSE2-NEXT: xorps %xmm2, %xmm2 5004; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 5005; SSE2-NEXT: paddb (%rdx), %xmm1 5006; SSE2-NEXT: movdqa 16(%rdx), %xmm0 5007; SSE2-NEXT: paddb %xmm2, %xmm0 5008; SSE2-NEXT: paddb 32(%rdx), %xmm2 5009; SSE2-NEXT: movdqa %xmm2, 32(%rcx) 5010; SSE2-NEXT: movdqa %xmm1, (%rcx) 5011; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 5012; SSE2-NEXT: retq 5013; 5014; SSE42-LABEL: 
vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5015; SSE42: # %bb.0: 5016; SSE42-NEXT: movdqa (%rdi), %xmm0 5017; SSE42-NEXT: movdqa 48(%rdi), %xmm1 5018; SSE42-NEXT: paddb (%rsi), %xmm0 5019; SSE42-NEXT: paddb 48(%rsi), %xmm1 5020; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5021; SSE42-NEXT: pxor %xmm2, %xmm2 5022; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5023; SSE42-NEXT: paddb (%rdx), %xmm1 5024; SSE42-NEXT: movdqa 16(%rdx), %xmm0 5025; SSE42-NEXT: paddb %xmm2, %xmm0 5026; SSE42-NEXT: paddb 32(%rdx), %xmm2 5027; SSE42-NEXT: movdqa %xmm2, 32(%rcx) 5028; SSE42-NEXT: movdqa %xmm1, (%rcx) 5029; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 5030; SSE42-NEXT: retq 5031; 5032; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5033; AVX: # %bb.0: 5034; AVX-NEXT: vmovdqa (%rdi), %xmm0 5035; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 5036; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5037; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 5038; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5039; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 5040; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5041; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 5042; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 5043; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5044; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 5045; AVX-NEXT: vmovdqa %xmm1, (%rcx) 5046; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 5047; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 5048; AVX-NEXT: retq 5049; 5050; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5051; AVX2: # %bb.0: 5052; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5053; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5054; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 5055; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 5056; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] 5057; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] 5058; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 5059; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5060; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 5061; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5062; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 5063; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5064; AVX2-NEXT: vmovdqa %ymm1, (%rcx) 5065; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) 5066; AVX2-NEXT: vzeroupper 5067; AVX2-NEXT: retq 5068; 5069; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5070; AVX512F: # %bb.0: 5071; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5072; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 5073; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5074; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5075; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 5076; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 5077; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] 5078; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 5079; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 5080; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5081; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 5082; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 5083; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 5084; AVX512F-NEXT: vzeroupper 5085; AVX512F-NEXT: retq 5086; 5087; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5088; AVX512DQ: # %bb.0: 5089; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5090; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 5091; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5092; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5093; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0 5094; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 5095; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] 5096; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 5097; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 5098; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 5099; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 5100; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 5101; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 5102; AVX512DQ-NEXT: vzeroupper 5103; AVX512DQ-NEXT: retq 5104; 5105; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5106; AVX512BW-SLOW: # %bb.0: 5107; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 5108; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23] 5109; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5110; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 5111; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 5112; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 5113; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5114; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 5115; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5116; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 5117; AVX512BW-SLOW-NEXT: vzeroupper 5118; AVX512BW-SLOW-NEXT: retq 5119; 5120; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: 5121; AVX512BW-FAST: # %bb.0: 5122; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 5123; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] 5124; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5125; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 5126; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 5127; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 5128; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 5129; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5130; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 5131; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5132; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 5133; AVX512BW-FAST-NEXT: vzeroupper 5134; AVX512BW-FAST-NEXT: retq 5135 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5136 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5137 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5138 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 5139 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23> 5140 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8> 5141 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5142 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5143 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias 5144 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5145 ret void 5146} 5147 5148define void 
@vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5149; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: 5150; SSE2: # %bb.0: 5151; SSE2-NEXT: movdqa (%rdi), %xmm0 5152; SSE2-NEXT: movdqa 48(%rdi), %xmm1 5153; SSE2-NEXT: paddb (%rsi), %xmm0 5154; SSE2-NEXT: paddb 48(%rsi), %xmm1 5155; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 5156; SSE2-NEXT: xorps %xmm2, %xmm2 5157; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 5158; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] 5159; SSE2-NEXT: movaps 32(%rdx), %xmm2 5160; SSE2-NEXT: paddb (%rdx), %xmm1 5161; SSE2-NEXT: paddb 16(%rdx), %xmm0 5162; SSE2-NEXT: movaps %xmm2, 32(%rcx) 5163; SSE2-NEXT: movdqa %xmm1, (%rcx) 5164; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 5165; SSE2-NEXT: retq 5166; 5167; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: 5168; SSE42: # %bb.0: 5169; SSE42-NEXT: movdqa (%rdi), %xmm0 5170; SSE42-NEXT: movdqa 48(%rdi), %xmm1 5171; SSE42-NEXT: paddb (%rsi), %xmm0 5172; SSE42-NEXT: paddb 48(%rsi), %xmm1 5173; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5174; SSE42-NEXT: pxor %xmm2, %xmm2 5175; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] 5176; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] 5177; SSE42-NEXT: movaps 32(%rdx), %xmm2 5178; SSE42-NEXT: paddb (%rdx), %xmm1 5179; SSE42-NEXT: paddb 16(%rdx), %xmm0 5180; SSE42-NEXT: movaps %xmm2, 32(%rcx) 5181; SSE42-NEXT: movdqa %xmm1, (%rcx) 5182; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 5183; SSE42-NEXT: retq 5184; 5185; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: 5186; AVX: # %bb.0: 5187; AVX-NEXT: vmovdqa (%rdi), %xmm0 5188; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 5189; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5190; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 5191; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 5192; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5193; AVX-NEXT: vmovaps 32(%rdx), %ymm2 5194; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 5195; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 5196; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] 5197; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 5198; AVX-NEXT: vmovaps %ymm2, 32(%rcx) 5199; AVX-NEXT: vmovdqa %xmm1, (%rcx) 5200; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 5201; AVX-NEXT: vzeroupper 5202; AVX-NEXT: retq 5203; 5204; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: 5205; AVX2: # %bb.0: 5206; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5207; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 5208; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5209; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5210; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5211; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] 5212; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 5213; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 5214; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] 5215; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 5216; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5217; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 5218; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5219; AVX2-NEXT: vzeroupper 5220; AVX2-NEXT: retq 5221; 5222; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: 5223; AVX512F: # %bb.0: 5224; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5225; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 5226; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5227; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5228; AVX512F-NEXT: 
vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 0, i32 19, i32 20, i32 21, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: paddb 48(%rsi), %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: paddb (%rdx), %xmm1
; SSE2-NEXT: movdqa 16(%rdx), %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: paddb 32(%rdx), %xmm0
; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 48(%rdi), %xmm1
; SSE42-NEXT: paddb 48(%rsi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa 16(%rdx), %xmm2
; SSE42-NEXT: paddb %xmm0, %xmm2
; SSE42-NEXT: paddb 32(%rdx), %xmm0
; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 0, i32 9, i32 0, i32 11>
  %out.bytevec = bitcast <6 x i64> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr
%out.vec.ptr, align 64 5424 ret void 5425} 5426 5427define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5428; SSE2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5429; SSE2: # %bb.0: 5430; SSE2-NEXT: movdqa (%rdi), %xmm0 5431; SSE2-NEXT: movdqa 48(%rdi), %xmm1 5432; SSE2-NEXT: paddb (%rsi), %xmm0 5433; SSE2-NEXT: paddb 48(%rsi), %xmm1 5434; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 5435; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 5436; SSE2-NEXT: movaps 32(%rdx), %xmm2 5437; SSE2-NEXT: paddb (%rdx), %xmm1 5438; SSE2-NEXT: paddb 16(%rdx), %xmm0 5439; SSE2-NEXT: movaps %xmm2, 32(%rcx) 5440; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 5441; SSE2-NEXT: movdqa %xmm1, (%rcx) 5442; SSE2-NEXT: retq 5443; 5444; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5445; SSE42: # %bb.0: 5446; SSE42-NEXT: movdqa (%rdi), %xmm0 5447; SSE42-NEXT: movdqa 48(%rdi), %xmm1 5448; SSE42-NEXT: paddb 48(%rsi), %xmm1 5449; SSE42-NEXT: paddb (%rsi), %xmm0 5450; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 5451; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 5452; SSE42-NEXT: movaps 32(%rdx), %xmm2 5453; SSE42-NEXT: paddb (%rdx), %xmm1 5454; SSE42-NEXT: paddb 16(%rdx), %xmm0 5455; SSE42-NEXT: movaps %xmm2, 32(%rcx) 5456; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 5457; SSE42-NEXT: movdqa %xmm1, (%rcx) 5458; SSE42-NEXT: retq 5459; 5460; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5461; AVX: # %bb.0: 5462; AVX-NEXT: vmovdqa (%rdi), %xmm0 5463; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 5464; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 5465; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5466; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 5467; AVX-NEXT: vmovaps 32(%rdx), %ymm2 5468; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 5469; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 5470; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 5471; AVX-NEXT: vmovaps %ymm2, 32(%rcx) 5472; AVX-NEXT: vmovdqa %xmm1, (%rcx) 5473; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 5474; AVX-NEXT: vzeroupper 5475; AVX-NEXT: retq 5476; 5477; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5478; AVX2: # %bb.0: 5479; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5480; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 5481; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5482; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5483; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 5484; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0] 5485; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 5486; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 5487; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 5488; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5489; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) 5490; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5491; AVX2-NEXT: vzeroupper 5492; AVX2-NEXT: retq 5493; 5494; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5495; AVX512F: # %bb.0: 5496; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5497; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 5498; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5499; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5500; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 5501; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 5502; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] 5503; 
AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 5504; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 5505; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 5506; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) 5507; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5508; AVX512F-NEXT: vzeroupper 5509; AVX512F-NEXT: retq 5510; 5511; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5512; AVX512DQ: # %bb.0: 5513; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5514; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 5515; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 5516; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5517; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 5518; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 5519; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] 5520; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 5521; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 5522; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 5523; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx) 5524; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5525; AVX512DQ-NEXT: vzeroupper 5526; AVX512DQ-NEXT: retq 5527; 5528; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5529; AVX512BW-SLOW: # %bb.0: 5530; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 5531; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0] 5532; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5533; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 5534; AVX512BW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 5535; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, %ymm0 5536; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5537; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 5538; AVX512BW-SLOW-NEXT: vzeroupper 5539; AVX512BW-SLOW-NEXT: retq 5540; 5541; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 5542; AVX512BW-FAST: # %bb.0: 5543; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 5544; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,2,0] 5545; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5546; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 5547; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 5548; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 5549; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5550; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) 5551; AVX512BW-FAST-NEXT: vzeroupper 5552; AVX512BW-FAST-NEXT: retq 5553 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5554 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5555 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5556 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 5557 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 0, i32 10, i32 11> 5558 %out.bytevec = bitcast <6 x i64> %broadcast.of.zextinreg to <48 x i8> 5559 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5560 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5561 %out.vec = add <64 x i8> %out.bytevec.padded, 
%out.vec.bias 5562 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5563 ret void 5564} 5565 5566define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5567; SSE2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5568; SSE2: # %bb.0: 5569; SSE2-NEXT: movdqa (%rdi), %xmm0 5570; SSE2-NEXT: paddb (%rsi), %xmm0 5571; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5572; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5573; SSE2-NEXT: pxor %xmm1, %xmm1 5574; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5575; SSE2-NEXT: movdqa 16(%rdx), %xmm1 5576; SSE2-NEXT: paddb %xmm0, %xmm1 5577; SSE2-NEXT: movdqa (%rdx), %xmm2 5578; SSE2-NEXT: paddb %xmm0, %xmm2 5579; SSE2-NEXT: movdqa 48(%rdx), %xmm3 5580; SSE2-NEXT: paddb %xmm0, %xmm3 5581; SSE2-NEXT: paddb 32(%rdx), %xmm0 5582; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 5583; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 5584; SSE2-NEXT: movdqa %xmm2, (%rcx) 5585; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 5586; SSE2-NEXT: retq 5587; 5588; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5589; SSE42: # %bb.0: 5590; SSE42-NEXT: movdqa (%rdi), %xmm0 5591; SSE42-NEXT: paddb (%rsi), %xmm0 5592; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero 5593; SSE42-NEXT: movdqa 16(%rdx), %xmm1 5594; SSE42-NEXT: paddb %xmm0, %xmm1 5595; SSE42-NEXT: movdqa (%rdx), %xmm2 5596; SSE42-NEXT: paddb %xmm0, %xmm2 5597; SSE42-NEXT: movdqa 48(%rdx), %xmm3 5598; SSE42-NEXT: paddb %xmm0, %xmm3 5599; SSE42-NEXT: paddb 32(%rdx), %xmm0 5600; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 5601; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 5602; SSE42-NEXT: movdqa %xmm2, (%rcx) 5603; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 5604; SSE42-NEXT: retq 5605; 5606; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5607; AVX: # %bb.0: 5608; AVX-NEXT: vmovdqa (%rdi), %xmm0 5609; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5610; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero 5611; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5612; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5613; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5614; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5615; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5616; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5617; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5618; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5619; AVX-NEXT: retq 5620; 5621; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5622; AVX2: # %bb.0: 5623; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5624; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5625; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5626; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero 5627; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5628; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5629; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5630; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5631; AVX2-NEXT: vzeroupper 5632; AVX2-NEXT: retq 5633; 5634; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5635; AVX512F: # %bb.0: 
5636; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5637; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5638; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5639; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero 5640; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5641; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5642; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5643; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5644; AVX512F-NEXT: vzeroupper 5645; AVX512F-NEXT: retq 5646; 5647; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5648; AVX512DQ: # %bb.0: 5649; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5650; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5651; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5652; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero 5653; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5654; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5655; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5656; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5657; AVX512DQ-NEXT: vzeroupper 5658; AVX512DQ-NEXT: retq 5659; 5660; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: 5661; AVX512BW: # %bb.0: 5662; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5663; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5664; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 5665; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero 5666; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5667; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5668; AVX512BW-NEXT: vzeroupper 5669; AVX512BW-NEXT: retq 5670 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5671 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5672 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5673 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95, i32 0, i32 97, i32 0, i32 99, i32 0, i32 101, i32 0, i32 103, i32 0, i32 105, i32 0, i32 107, i32 0, i32 109, i32 0, i32 111, i32 0, i32 113, i32 0, i32 115, i32 0, i32 117, i32 0, i32 119, i32 0, i32 121, i32 0, i32 123, i32 0, i32 125, i32 0, i32 127> 5674 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5675 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias 5676 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5677 ret void 5678} 5679 5680define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5681; SSE2-LABEL: 
vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5682; SSE2: # %bb.0: 5683; SSE2-NEXT: movdqa (%rdi), %xmm0 5684; SSE2-NEXT: paddb (%rsi), %xmm0 5685; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5686; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 5687; SSE2-NEXT: movdqa 16(%rdx), %xmm1 5688; SSE2-NEXT: paddb %xmm0, %xmm1 5689; SSE2-NEXT: movdqa (%rdx), %xmm2 5690; SSE2-NEXT: paddb %xmm0, %xmm2 5691; SSE2-NEXT: movdqa 48(%rdx), %xmm3 5692; SSE2-NEXT: paddb %xmm0, %xmm3 5693; SSE2-NEXT: paddb 32(%rdx), %xmm0 5694; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 5695; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 5696; SSE2-NEXT: movdqa %xmm2, (%rcx) 5697; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 5698; SSE2-NEXT: retq 5699; 5700; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5701; SSE42: # %bb.0: 5702; SSE42-NEXT: movdqa (%rdi), %xmm0 5703; SSE42-NEXT: paddb (%rsi), %xmm0 5704; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 5705; SSE42-NEXT: movdqa 16(%rdx), %xmm1 5706; SSE42-NEXT: paddb %xmm0, %xmm1 5707; SSE42-NEXT: movdqa (%rdx), %xmm2 5708; SSE42-NEXT: paddb %xmm0, %xmm2 5709; SSE42-NEXT: movdqa 48(%rdx), %xmm3 5710; SSE42-NEXT: paddb %xmm0, %xmm3 5711; SSE42-NEXT: paddb 32(%rdx), %xmm0 5712; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 5713; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 5714; SSE42-NEXT: movdqa %xmm2, (%rcx) 5715; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 5716; SSE42-NEXT: retq 5717; 5718; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5719; AVX: # %bb.0: 5720; AVX-NEXT: vmovdqa (%rdi), %xmm0 5721; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5722; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero 5723; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5724; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5725; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5726; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5727; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5728; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5729; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5730; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5731; AVX-NEXT: retq 5732; 5733; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5734; AVX2: # %bb.0: 5735; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5736; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5737; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5738; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero 5739; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5740; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5741; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5742; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5743; AVX2-NEXT: vzeroupper 5744; AVX2-NEXT: retq 5745; 5746; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5747; AVX512F: # %bb.0: 5748; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5749; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5750; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5751; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero 5752; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5753; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5754; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5755; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5756; 
AVX512F-NEXT: vzeroupper 5757; AVX512F-NEXT: retq 5758; 5759; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5760; AVX512DQ: # %bb.0: 5761; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5762; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5763; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5764; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero 5765; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5766; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5767; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5768; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5769; AVX512DQ-NEXT: vzeroupper 5770; AVX512DQ-NEXT: retq 5771; 5772; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: 5773; AVX512BW: # %bb.0: 5774; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5775; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5776; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 5777; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero 5778; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5779; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5780; AVX512BW-NEXT: vzeroupper 5781; AVX512BW-NEXT: retq 5782 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5783 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5784 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5785 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 0, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 0, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 0, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 0, i32 125, i32 126, i32 127> 5786 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5787 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias 5788 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5789 ret void 5790} 5791 5792define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5793; SSE2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5794; SSE2: # %bb.0: 5795; SSE2-NEXT: movdqa (%rdi), %xmm0 5796; SSE2-NEXT: paddb (%rsi), %xmm0 5797; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5798; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 5799; SSE2-NEXT: movdqa 16(%rdx), %xmm1 5800; SSE2-NEXT: paddb %xmm0, %xmm1 5801; SSE2-NEXT: movdqa (%rdx), %xmm2 5802; SSE2-NEXT: paddb %xmm0, %xmm2 5803; SSE2-NEXT: movdqa 48(%rdx), %xmm3 5804; SSE2-NEXT: paddb %xmm0, %xmm3 5805; SSE2-NEXT: paddb 32(%rdx), %xmm0 5806; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 5807; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 5808; SSE2-NEXT: movdqa %xmm2, (%rcx) 5809; SSE2-NEXT: movdqa %xmm1, 
16(%rcx) 5810; SSE2-NEXT: retq 5811; 5812; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5813; SSE42: # %bb.0: 5814; SSE42-NEXT: movdqa (%rdi), %xmm0 5815; SSE42-NEXT: paddb (%rsi), %xmm0 5816; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 5817; SSE42-NEXT: movdqa 16(%rdx), %xmm1 5818; SSE42-NEXT: paddb %xmm0, %xmm1 5819; SSE42-NEXT: movdqa (%rdx), %xmm2 5820; SSE42-NEXT: paddb %xmm0, %xmm2 5821; SSE42-NEXT: movdqa 48(%rdx), %xmm3 5822; SSE42-NEXT: paddb %xmm0, %xmm3 5823; SSE42-NEXT: paddb 32(%rdx), %xmm0 5824; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 5825; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 5826; SSE42-NEXT: movdqa %xmm2, (%rcx) 5827; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 5828; SSE42-NEXT: retq 5829; 5830; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5831; AVX: # %bb.0: 5832; AVX-NEXT: vmovdqa (%rdi), %xmm0 5833; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5834; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero 5835; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5836; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5837; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5838; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5839; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5840; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5841; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5842; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5843; AVX-NEXT: retq 5844; 5845; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5846; AVX2: # %bb.0: 5847; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5848; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5849; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5850; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero 5851; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5852; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5853; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5854; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5855; AVX2-NEXT: vzeroupper 5856; AVX2-NEXT: retq 5857; 5858; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5859; AVX512F: # %bb.0: 5860; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5861; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5862; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5863; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero 5864; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5865; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5866; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5867; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5868; AVX512F-NEXT: vzeroupper 5869; AVX512F-NEXT: retq 5870; 5871; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5872; AVX512DQ: # %bb.0: 5873; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5874; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5875; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5876; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero 5877; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5878; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5879; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5880; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5881; AVX512DQ-NEXT: 
vzeroupper 5882; AVX512DQ-NEXT: retq 5883; 5884; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: 5885; AVX512BW: # %bb.0: 5886; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5887; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5888; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 5889; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero 5890; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5891; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5892; AVX512BW-NEXT: vzeroupper 5893; AVX512BW-NEXT: retq 5894 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5895 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5896 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5897 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5898 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5899 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias 5900 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5901 ret void 5902} 5903 5904define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5905; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5906; SSE: # %bb.0: 5907; SSE-NEXT: movdqa (%rdi), %xmm0 5908; SSE-NEXT: paddb (%rsi), %xmm0 5909; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 5910; SSE-NEXT: movdqa 16(%rdx), %xmm1 5911; SSE-NEXT: paddb %xmm0, %xmm1 5912; SSE-NEXT: movdqa (%rdx), %xmm2 5913; SSE-NEXT: paddb %xmm0, %xmm2 5914; SSE-NEXT: movdqa 48(%rdx), %xmm3 5915; SSE-NEXT: paddb %xmm0, %xmm3 5916; SSE-NEXT: paddb 32(%rdx), %xmm0 5917; SSE-NEXT: movdqa %xmm0, 32(%rcx) 5918; SSE-NEXT: movdqa %xmm3, 48(%rcx) 5919; SSE-NEXT: movdqa %xmm2, (%rcx) 5920; SSE-NEXT: movdqa %xmm1, 16(%rcx) 5921; SSE-NEXT: retq 5922; 5923; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5924; AVX: # %bb.0: 5925; AVX-NEXT: vmovdqa (%rdi), %xmm0 5926; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 5927; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 5928; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 5929; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 5930; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 5931; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 5932; AVX-NEXT: vmovdqa %xmm0, (%rcx) 5933; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 5934; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 5935; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 5936; AVX-NEXT: retq 5937; 5938; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5939; AVX2: # %bb.0: 5940; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5941; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5942; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5943; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 5944; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5945; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5946; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5947; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5948; AVX2-NEXT: vzeroupper 5949; AVX2-NEXT: retq 5950; 5951; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5952; AVX512F: # %bb.0: 5953; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 5954; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5955; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5956; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 5957; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5958; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5959; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 5960; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 5961; AVX512F-NEXT: vzeroupper 5962; AVX512F-NEXT: retq 5963; 5964; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5965; AVX512DQ: # %bb.0: 5966; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 5967; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 5968; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 5969; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 5970; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 5971; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 5972; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 5973; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 5974; AVX512DQ-NEXT: vzeroupper 5975; AVX512DQ-NEXT: retq 5976; 5977; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: 5978; AVX512BW: # %bb.0: 5979; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 5980; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 5981; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 5982; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 5983; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 5984; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 5985; AVX512BW-NEXT: vzeroupper 5986; AVX512BW-NEXT: retq 5987 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 5988 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 5989 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 5990 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5991 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 5992 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias 5993 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 5994 ret void 5995} 5996 5997define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 5998; SSE-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 5999; SSE: # %bb.0: 6000; SSE-NEXT: movdqa (%rdi), %xmm0 6001; SSE-NEXT: paddb (%rsi), %xmm0 6002; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6003; SSE-NEXT: movaps 16(%rdx), %xmm1 6004; SSE-NEXT: movaps 48(%rdx), %xmm2 6005; SSE-NEXT: movdqa 
(%rdx), %xmm3 6006; SSE-NEXT: paddb %xmm0, %xmm3 6007; SSE-NEXT: paddb 32(%rdx), %xmm0 6008; SSE-NEXT: movaps %xmm2, 48(%rcx) 6009; SSE-NEXT: movaps %xmm1, 16(%rcx) 6010; SSE-NEXT: movdqa %xmm0, 32(%rcx) 6011; SSE-NEXT: movdqa %xmm3, (%rcx) 6012; SSE-NEXT: retq 6013; 6014; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 6015; AVX: # %bb.0: 6016; AVX-NEXT: vmovdqa (%rdi), %xmm0 6017; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6018; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 6019; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 6020; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6021; AVX-NEXT: vmovaps 16(%rdx), %xmm2 6022; AVX-NEXT: vmovaps 48(%rdx), %xmm3 6023; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6024; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 6025; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6026; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 6027; AVX-NEXT: retq 6028; 6029; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 6030; AVX2: # %bb.0: 6031; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6032; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6033; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6034; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 6035; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6036; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6037; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6038; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6039; AVX2-NEXT: vzeroupper 6040; AVX2-NEXT: retq 6041; 6042; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 6043; AVX512F: # %bb.0: 6044; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6045; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6046; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6047; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 6048; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6049; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6050; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6051; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6052; AVX512F-NEXT: vzeroupper 6053; AVX512F-NEXT: retq 6054; 6055; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 6056; AVX512DQ: # %bb.0: 6057; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6058; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6059; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] 6060; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 6061; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6062; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6063; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 6064; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 6065; AVX512DQ-NEXT: vzeroupper 6066; AVX512DQ-NEXT: retq 6067; 6068; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: 6069; AVX512BW: # %bb.0: 6070; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 6071; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6072; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 6073; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 6074; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6075; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6076; AVX512BW-NEXT: vzeroupper 6077; AVX512BW-NEXT: retq 6078 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6079 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6080 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6081 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 
101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 6082 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6083 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias 6084 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6085 ret void 6086} 6087 6088define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6089; SSE2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6090; SSE2: # %bb.0: 6091; SSE2-NEXT: movdqa (%rdi), %xmm0 6092; SSE2-NEXT: paddb (%rsi), %xmm0 6093; SSE2-NEXT: pxor %xmm1, %xmm1 6094; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 6095; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6096; SSE2-NEXT: movdqa 16(%rdx), %xmm1 6097; SSE2-NEXT: paddb %xmm0, %xmm1 6098; SSE2-NEXT: movdqa (%rdx), %xmm2 6099; SSE2-NEXT: paddb %xmm0, %xmm2 6100; SSE2-NEXT: movdqa 48(%rdx), %xmm3 6101; SSE2-NEXT: paddb %xmm0, %xmm3 6102; SSE2-NEXT: paddb 32(%rdx), %xmm0 6103; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6104; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 6105; SSE2-NEXT: movdqa %xmm2, (%rcx) 6106; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 6107; SSE2-NEXT: retq 6108; 6109; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6110; SSE42: # %bb.0: 6111; SSE42-NEXT: movdqa (%rdi), %xmm0 6112; SSE42-NEXT: paddb (%rsi), %xmm0 6113; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 6114; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 6115; SSE42-NEXT: movdqa 16(%rdx), %xmm1 6116; SSE42-NEXT: paddb %xmm0, %xmm1 6117; SSE42-NEXT: movdqa (%rdx), %xmm2 6118; SSE42-NEXT: paddb %xmm0, %xmm2 6119; SSE42-NEXT: movdqa 48(%rdx), %xmm3 6120; SSE42-NEXT: paddb %xmm0, %xmm3 6121; SSE42-NEXT: paddb 32(%rdx), %xmm0 6122; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 6123; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 6124; SSE42-NEXT: movdqa %xmm2, (%rcx) 6125; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 6126; SSE42-NEXT: retq 6127; 6128; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6129; AVX: # %bb.0: 6130; AVX-NEXT: vmovdqa (%rdi), %xmm0 6131; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6132; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 6133; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 6134; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 6135; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 6136; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 6137; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6138; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6139; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 6140; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 6141; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 6142; AVX-NEXT: retq 6143; 6144; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6145; AVX2: # %bb.0: 6146; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6147; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6148; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6149; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero 6150; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6151; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6152; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 
6153; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6154; AVX2-NEXT: vzeroupper 6155; AVX2-NEXT: retq 6156; 6157; AVX512F-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6158; AVX512F: # %bb.0: 6159; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6160; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6161; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6162; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero 6163; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6164; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6165; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6166; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6167; AVX512F-NEXT: vzeroupper 6168; AVX512F-NEXT: retq 6169; 6170; AVX512DQ-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6171; AVX512DQ: # %bb.0: 6172; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6173; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6174; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6175; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero 6176; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6177; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6178; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 6179; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 6180; AVX512DQ-NEXT: vzeroupper 6181; AVX512DQ-NEXT: retq 6182; 6183; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: 6184; AVX512BW: # %bb.0: 6185; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6186; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6187; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 6188; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] 6189; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 6190; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 6191; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6192; AVX512BW-NEXT: vzeroupper 6193; AVX512BW-NEXT: retq 6194 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6195 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6196 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6197 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6198 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63> 6199 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8> 6200 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6201 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6202 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6203 ret void 6204} 6205 6206define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6207; SSE2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6208; SSE2: # %bb.0: 6209; SSE2-NEXT: movdqa (%rdi), %xmm0 6210; SSE2-NEXT: paddb (%rsi), %xmm0 6211; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 6212; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6213; SSE2-NEXT: movdqa 16(%rdx), %xmm1 6214; SSE2-NEXT: paddb %xmm0, %xmm1 6215; SSE2-NEXT: 
movdqa (%rdx), %xmm2 6216; SSE2-NEXT: paddb %xmm0, %xmm2 6217; SSE2-NEXT: movdqa 48(%rdx), %xmm3 6218; SSE2-NEXT: paddb %xmm0, %xmm3 6219; SSE2-NEXT: paddb 32(%rdx), %xmm0 6220; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6221; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 6222; SSE2-NEXT: movdqa %xmm2, (%rcx) 6223; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 6224; SSE2-NEXT: retq 6225; 6226; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6227; SSE42: # %bb.0: 6228; SSE42-NEXT: movdqa (%rdi), %xmm0 6229; SSE42-NEXT: paddb (%rsi), %xmm0 6230; SSE42-NEXT: pxor %xmm1, %xmm1 6231; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 6232; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 6233; SSE42-NEXT: movdqa 16(%rdx), %xmm1 6234; SSE42-NEXT: paddb %xmm0, %xmm1 6235; SSE42-NEXT: movdqa (%rdx), %xmm2 6236; SSE42-NEXT: paddb %xmm0, %xmm2 6237; SSE42-NEXT: movdqa 48(%rdx), %xmm3 6238; SSE42-NEXT: paddb %xmm0, %xmm3 6239; SSE42-NEXT: paddb 32(%rdx), %xmm0 6240; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 6241; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 6242; SSE42-NEXT: movdqa %xmm2, (%rcx) 6243; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 6244; SSE42-NEXT: retq 6245; 6246; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6247; AVX: # %bb.0: 6248; AVX-NEXT: vmovdqa (%rdi), %xmm0 6249; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6250; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 6251; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 6252; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] 6253; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 6254; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 6255; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 6256; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6257; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6258; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 6259; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 6260; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 6261; AVX-NEXT: retq 6262; 6263; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6264; AVX2: # %bb.0: 6265; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6266; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6267; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6268; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero 6269; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6270; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6271; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6272; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6273; AVX2-NEXT: vzeroupper 6274; AVX2-NEXT: retq 6275; 6276; AVX512F-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6277; AVX512F: # %bb.0: 6278; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6279; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6280; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6281; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero 6282; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6283; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6284; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6285; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6286; AVX512F-NEXT: vzeroupper 6287; AVX512F-NEXT: retq 6288; 6289; AVX512DQ-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6290; AVX512DQ: # %bb.0: 6291; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6292; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6293; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6294; AVX512DQ-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero 6295; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6296; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6297; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 6298; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 6299; AVX512DQ-NEXT: vzeroupper 6300; AVX512DQ-NEXT: retq 6301; 6302; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: 6303; AVX512BW: # %bb.0: 6304; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6305; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6306; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 6307; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63] 6308; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 6309; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 6310; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6311; AVX512BW-NEXT: vzeroupper 6312; AVX512BW-NEXT: retq 6313 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6314 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6315 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6316 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6317 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63> 6318 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8> 6319 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6320 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6321 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6322 ret void 6323} 6324 6325define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6326; SSE2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6327; SSE2: # %bb.0: 6328; SSE2-NEXT: movdqa (%rdi), %xmm0 6329; SSE2-NEXT: paddb (%rsi), %xmm0 6330; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6331; SSE2-NEXT: movdqa 16(%rdx), %xmm1 6332; SSE2-NEXT: paddb %xmm0, %xmm1 6333; SSE2-NEXT: movdqa (%rdx), %xmm2 6334; SSE2-NEXT: paddb %xmm0, %xmm2 6335; SSE2-NEXT: movdqa 48(%rdx), %xmm3 6336; SSE2-NEXT: paddb %xmm0, %xmm3 6337; SSE2-NEXT: paddb 32(%rdx), %xmm0 6338; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6339; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 6340; SSE2-NEXT: movdqa %xmm2, (%rcx) 6341; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 6342; SSE2-NEXT: retq 6343; 6344; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6345; SSE42: # %bb.0: 6346; SSE42-NEXT: movdqa (%rdi), %xmm0 6347; SSE42-NEXT: paddb (%rsi), %xmm0 6348; SSE42-NEXT: pxor %xmm1, %xmm1 6349; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6350; SSE42-NEXT: movdqa 16(%rdx), %xmm0 6351; SSE42-NEXT: paddb %xmm1, %xmm0 6352; SSE42-NEXT: movdqa (%rdx), %xmm2 6353; SSE42-NEXT: paddb %xmm1, %xmm2 6354; SSE42-NEXT: movdqa 48(%rdx), %xmm3 6355; SSE42-NEXT: paddb %xmm1, %xmm3 6356; SSE42-NEXT: paddb 32(%rdx), %xmm1 6357; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 6358; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 6359; SSE42-NEXT: movdqa %xmm2, (%rcx) 6360; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 6361; SSE42-NEXT: retq 6362; 6363; AVX-LABEL: 
vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6364; AVX: # %bb.0: 6365; AVX-NEXT: vmovdqa (%rdi), %xmm0 6366; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6367; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 6368; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6369; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 6370; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 6371; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 6372; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6373; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6374; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) 6375; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 6376; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) 6377; AVX-NEXT: retq 6378; 6379; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6380; AVX2: # %bb.0: 6381; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6382; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6383; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 6384; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6385; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 6386; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6387; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6388; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6389; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6390; AVX2-NEXT: vzeroupper 6391; AVX2-NEXT: retq 6392; 6393; AVX512F-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6394; AVX512F: # %bb.0: 6395; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6396; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6397; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6398; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 6399; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 6400; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6401; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6402; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) 6403; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6404; AVX512F-NEXT: vzeroupper 6405; AVX512F-NEXT: retq 6406; 6407; AVX512DQ-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6408; AVX512DQ: # %bb.0: 6409; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6410; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6411; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6412; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 6413; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 6414; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6415; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6416; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 6417; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 6418; AVX512DQ-NEXT: vzeroupper 6419; AVX512DQ-NEXT: retq 6420; 6421; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: 6422; AVX512BW: # %bb.0: 6423; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6424; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6425; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 6426; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63] 6427; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 6428; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 6429; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6430; AVX512BW-NEXT: vzeroupper 6431; AVX512BW-NEXT: retq 6432 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6433 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6434 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6435 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6436 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 
35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 6437 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8> 6438 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6439 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6440 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6441 ret void 6442} 6443 6444define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6445; SSE2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6446; SSE2: # %bb.0: 6447; SSE2-NEXT: movdqa (%rdi), %xmm0 6448; SSE2-NEXT: paddb (%rsi), %xmm0 6449; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6450; SSE2-NEXT: movaps 16(%rdx), %xmm1 6451; SSE2-NEXT: movaps 48(%rdx), %xmm2 6452; SSE2-NEXT: movdqa (%rdx), %xmm3 6453; SSE2-NEXT: paddb %xmm0, %xmm3 6454; SSE2-NEXT: paddb 32(%rdx), %xmm0 6455; SSE2-NEXT: movaps %xmm2, 48(%rcx) 6456; SSE2-NEXT: movaps %xmm1, 16(%rcx) 6457; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6458; SSE2-NEXT: movdqa %xmm3, (%rcx) 6459; SSE2-NEXT: retq 6460; 6461; SSE42-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6462; SSE42: # %bb.0: 6463; SSE42-NEXT: movdqa (%rdi), %xmm0 6464; SSE42-NEXT: paddb (%rsi), %xmm0 6465; SSE42-NEXT: pxor %xmm1, %xmm1 6466; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6467; SSE42-NEXT: movaps 16(%rdx), %xmm0 6468; SSE42-NEXT: movaps 48(%rdx), %xmm2 6469; SSE42-NEXT: movdqa (%rdx), %xmm3 6470; SSE42-NEXT: paddb %xmm1, %xmm3 6471; SSE42-NEXT: paddb 32(%rdx), %xmm1 6472; SSE42-NEXT: movaps %xmm2, 48(%rcx) 6473; SSE42-NEXT: movaps %xmm0, 16(%rcx) 6474; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 6475; SSE42-NEXT: movdqa %xmm3, (%rcx) 6476; SSE42-NEXT: retq 6477; 6478; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6479; AVX: # %bb.0: 6480; AVX-NEXT: vmovdqa (%rdi), %xmm0 6481; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6482; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 6483; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 6484; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 6485; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6486; AVX-NEXT: vmovaps 16(%rdx), %xmm2 6487; AVX-NEXT: vmovaps 48(%rdx), %xmm3 6488; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6489; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 6490; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6491; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 6492; AVX-NEXT: retq 6493; 6494; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6495; AVX2: # %bb.0: 6496; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6497; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6498; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6499; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 6500; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6501; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6502; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6503; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6504; AVX2-NEXT: vzeroupper 6505; AVX2-NEXT: retq 6506; 6507; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6508; AVX512F: # %bb.0: 6509; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6510; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6511; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6512; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 6513; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6514; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6515; AVX512F-NEXT: 
vmovdqa %ymm0, (%rcx) 6516; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 6517; AVX512F-NEXT: vzeroupper 6518; AVX512F-NEXT: retq 6519; 6520; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6521; AVX512DQ: # %bb.0: 6522; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6523; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6524; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] 6525; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 6526; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6527; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6528; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 6529; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 6530; AVX512DQ-NEXT: vzeroupper 6531; AVX512DQ-NEXT: retq 6532; 6533; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: 6534; AVX512BW: # %bb.0: 6535; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 6536; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6537; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 6538; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 6539; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6540; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6541; AVX512BW-NEXT: vzeroupper 6542; AVX512BW-NEXT: retq 6543 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6544 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6545 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6546 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> 6547 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 6548 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8> 6549 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6550 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6551 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6552 ret void 6553} 6554 6555define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6556; SSE2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6557; SSE2: # %bb.0: 6558; SSE2-NEXT: movdqa (%rdi), %xmm0 6559; SSE2-NEXT: paddb (%rsi), %xmm0 6560; SSE2-NEXT: pxor %xmm1, %xmm1 6561; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 6562; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 6563; SSE2-NEXT: movdqa 16(%rdx), %xmm1 6564; SSE2-NEXT: paddb %xmm0, %xmm1 6565; SSE2-NEXT: movdqa (%rdx), %xmm2 6566; SSE2-NEXT: paddb %xmm0, %xmm2 6567; SSE2-NEXT: movdqa 48(%rdx), %xmm3 6568; SSE2-NEXT: paddb %xmm0, %xmm3 6569; SSE2-NEXT: paddb 32(%rdx), %xmm0 6570; SSE2-NEXT: movdqa %xmm0, 32(%rcx) 6571; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 6572; SSE2-NEXT: movdqa %xmm2, (%rcx) 6573; SSE2-NEXT: movdqa %xmm1, 16(%rcx) 6574; SSE2-NEXT: retq 6575; 6576; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6577; SSE42: # %bb.0: 6578; SSE42-NEXT: movdqa (%rdi), %xmm0 6579; SSE42-NEXT: paddb (%rsi), %xmm0 6580; SSE42-NEXT: pxor %xmm1, %xmm1 6581; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 6582; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 6583; SSE42-NEXT: movdqa 16(%rdx), %xmm1 6584; SSE42-NEXT: paddb %xmm0, %xmm1 6585; SSE42-NEXT: movdqa (%rdx), %xmm2 6586; SSE42-NEXT: paddb %xmm0, %xmm2 6587; SSE42-NEXT: 
movdqa 48(%rdx), %xmm3 6588; SSE42-NEXT: paddb %xmm0, %xmm3 6589; SSE42-NEXT: paddb 32(%rdx), %xmm0 6590; SSE42-NEXT: movdqa %xmm0, 32(%rcx) 6591; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 6592; SSE42-NEXT: movdqa %xmm2, (%rcx) 6593; SSE42-NEXT: movdqa %xmm1, 16(%rcx) 6594; SSE42-NEXT: retq 6595; 6596; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6597; AVX: # %bb.0: 6598; AVX-NEXT: vmovdqa (%rdi), %xmm0 6599; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6600; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 6601; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6602; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] 6603; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 6604; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 6605; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 6606; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 6607; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 6608; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6609; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6610; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 6611; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 6612; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) 6613; AVX-NEXT: vzeroupper 6614; AVX-NEXT: retq 6615; 6616; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6617; AVX2: # %bb.0: 6618; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 6619; AVX2-NEXT: vmovdqa (%rdi), %xmm1 6620; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 6621; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 6622; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6623; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6624; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6625; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6626; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6627; AVX2-NEXT: vzeroupper 6628; AVX2-NEXT: retq 6629; 6630; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6631; AVX512F: # %bb.0: 6632; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6633; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6634; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 6635; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] 6636; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 6637; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6638; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6639; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6640; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 6641; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 6642; AVX512F-NEXT: vzeroupper 6643; AVX512F-NEXT: retq 6644; 6645; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6646; AVX512DQ: # %bb.0: 6647; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6648; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6649; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 6650; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] 6651; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 6652; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6653; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6654; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6655; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 6656; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 6657; AVX512DQ-NEXT: vzeroupper 6658; AVX512DQ-NEXT: retq 6659; 6660; AVX512BW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: 6661; AVX512BW: # %bb.0: 6662; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6663; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6664; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 6665; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] 6666; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 6667; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 
6668; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6669; AVX512BW-NEXT: vzeroupper 6670; AVX512BW-NEXT: retq 6671 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6672 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6673 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6674 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 6675 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31> 6676 %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8> 6677 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6678 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6679 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6680 ret void 6681} 6682 6683define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6684; SSE2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6685; SSE2: # %bb.0: 6686; SSE2-NEXT: movdqa (%rdi), %xmm0 6687; SSE2-NEXT: paddb (%rsi), %xmm0 6688; SSE2-NEXT: xorps %xmm1, %xmm1 6689; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 6690; SSE2-NEXT: movdqa 16(%rdx), %xmm0 6691; SSE2-NEXT: paddb %xmm1, %xmm0 6692; SSE2-NEXT: movdqa (%rdx), %xmm2 6693; SSE2-NEXT: paddb %xmm1, %xmm2 6694; SSE2-NEXT: movdqa 48(%rdx), %xmm3 6695; SSE2-NEXT: paddb %xmm1, %xmm3 6696; SSE2-NEXT: paddb 32(%rdx), %xmm1 6697; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 6698; SSE2-NEXT: movdqa %xmm3, 48(%rcx) 6699; SSE2-NEXT: movdqa %xmm2, (%rcx) 6700; SSE2-NEXT: movdqa %xmm0, 16(%rcx) 6701; SSE2-NEXT: retq 6702; 6703; SSE42-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6704; SSE42: # %bb.0: 6705; SSE42-NEXT: movdqa (%rdi), %xmm0 6706; SSE42-NEXT: paddb (%rsi), %xmm0 6707; SSE42-NEXT: pxor %xmm1, %xmm1 6708; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 6709; SSE42-NEXT: movdqa 16(%rdx), %xmm0 6710; SSE42-NEXT: paddb %xmm1, %xmm0 6711; SSE42-NEXT: movdqa (%rdx), %xmm2 6712; SSE42-NEXT: paddb %xmm1, %xmm2 6713; SSE42-NEXT: movdqa 48(%rdx), %xmm3 6714; SSE42-NEXT: paddb %xmm1, %xmm3 6715; SSE42-NEXT: paddb 32(%rdx), %xmm1 6716; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 6717; SSE42-NEXT: movdqa %xmm3, 48(%rcx) 6718; SSE42-NEXT: movdqa %xmm2, (%rcx) 6719; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 6720; SSE42-NEXT: retq 6721; 6722; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6723; AVX: # %bb.0: 6724; AVX-NEXT: vmovdqa (%rdi), %xmm0 6725; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6726; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 6727; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6728; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 6729; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 6730; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 6731; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 6732; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 6733; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6734; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6735; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 6736; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 6737; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) 6738; AVX-NEXT: vzeroupper 6739; AVX-NEXT: retq 6740; 6741; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6742; AVX2: # %bb.0: 6743; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6744; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6745; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 6746; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6747; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 6748; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6749; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6750; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6751; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6752; AVX2-NEXT: vzeroupper 6753; AVX2-NEXT: retq 6754; 6755; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6756; AVX512F: # %bb.0: 6757; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6758; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6759; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] 6760; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 6761; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 6762; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6763; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6764; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6765; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 6766; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 6767; AVX512F-NEXT: vzeroupper 6768; AVX512F-NEXT: retq 6769; 6770; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6771; AVX512DQ: # %bb.0: 6772; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6773; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6774; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] 6775; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 6776; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 6777; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6778; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6779; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6780; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 6781; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 6782; AVX512DQ-NEXT: vzeroupper 6783; AVX512DQ-NEXT: retq 6784; 6785; AVX512BW-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: 6786; AVX512BW: # %bb.0: 6787; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 6788; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 6789; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 6790; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] 6791; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 6792; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 6793; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6794; AVX512BW-NEXT: vzeroupper 6795; AVX512BW-NEXT: retq 6796 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6797 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6798 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6799 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 6800 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31> 6801 %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8> 6802 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6803 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6804 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6805 ret void 6806} 6807 6808define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6809; SSE2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6810; SSE2: # %bb.0: 6811; SSE2-NEXT: movdqa (%rdi), %xmm0 6812; SSE2-NEXT: paddb (%rsi), %xmm0 6813; SSE2-NEXT: xorps %xmm1, %xmm1 6814; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 6815; SSE2-NEXT: movaps 16(%rdx), %xmm0 6816; SSE2-NEXT: movaps 
48(%rdx), %xmm2 6817; SSE2-NEXT: movdqa (%rdx), %xmm3 6818; SSE2-NEXT: paddb %xmm1, %xmm3 6819; SSE2-NEXT: paddb 32(%rdx), %xmm1 6820; SSE2-NEXT: movaps %xmm2, 48(%rcx) 6821; SSE2-NEXT: movaps %xmm0, 16(%rcx) 6822; SSE2-NEXT: movdqa %xmm1, 32(%rcx) 6823; SSE2-NEXT: movdqa %xmm3, (%rcx) 6824; SSE2-NEXT: retq 6825; 6826; SSE42-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6827; SSE42: # %bb.0: 6828; SSE42-NEXT: movdqa (%rdi), %xmm0 6829; SSE42-NEXT: paddb (%rsi), %xmm0 6830; SSE42-NEXT: pxor %xmm1, %xmm1 6831; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 6832; SSE42-NEXT: movaps 16(%rdx), %xmm0 6833; SSE42-NEXT: movaps 48(%rdx), %xmm2 6834; SSE42-NEXT: movdqa (%rdx), %xmm3 6835; SSE42-NEXT: paddb %xmm1, %xmm3 6836; SSE42-NEXT: paddb 32(%rdx), %xmm1 6837; SSE42-NEXT: movaps %xmm2, 48(%rcx) 6838; SSE42-NEXT: movaps %xmm0, 16(%rcx) 6839; SSE42-NEXT: movdqa %xmm1, 32(%rcx) 6840; SSE42-NEXT: movdqa %xmm3, (%rcx) 6841; SSE42-NEXT: retq 6842; 6843; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6844; AVX: # %bb.0: 6845; AVX-NEXT: vmovdqa (%rdi), %xmm0 6846; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6847; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 6848; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 6849; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 6850; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6851; AVX-NEXT: vmovaps 16(%rdx), %xmm2 6852; AVX-NEXT: vmovaps 48(%rdx), %xmm3 6853; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 6854; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 6855; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6856; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 6857; AVX-NEXT: retq 6858; 6859; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6860; AVX2: # %bb.0: 6861; AVX2-NEXT: vmovdqa (%rdi), %ymm0 6862; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6863; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 6864; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 6865; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6866; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6867; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6868; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6869; AVX2-NEXT: vzeroupper 6870; AVX2-NEXT: retq 6871; 6872; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6873; AVX512F: # %bb.0: 6874; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6875; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6876; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] 6877; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 6878; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 6879; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6880; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6881; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6882; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 6883; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 6884; AVX512F-NEXT: vzeroupper 6885; AVX512F-NEXT: retq 6886; 6887; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6888; AVX512DQ: # %bb.0: 6889; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6890; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6891; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] 6892; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 6893; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 6894; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6895; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6896; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6897; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 6898; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 6899; AVX512DQ-NEXT: vzeroupper 6900; AVX512DQ-NEXT: retq 6901; 6902; AVX512BW-LABEL: 
vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: 6903; AVX512BW: # %bb.0: 6904; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 6905; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6906; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 6907; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 6908; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 6909; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 6910; AVX512BW-NEXT: vzeroupper 6911; AVX512BW-NEXT: retq 6912 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 6913 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 6914 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 6915 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> 6916 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 6917 %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8> 6918 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 6919 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 6920 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 6921 ret void 6922} 6923 6924define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 6925; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: 6926; SSE: # %bb.0: 6927; SSE-NEXT: movdqa (%rdi), %xmm0 6928; SSE-NEXT: paddb (%rsi), %xmm0 6929; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 6930; SSE-NEXT: movdqa 16(%rdx), %xmm1 6931; SSE-NEXT: paddb %xmm0, %xmm1 6932; SSE-NEXT: movdqa (%rdx), %xmm2 6933; SSE-NEXT: paddb %xmm0, %xmm2 6934; SSE-NEXT: movdqa 48(%rdx), %xmm3 6935; SSE-NEXT: paddb %xmm0, %xmm3 6936; SSE-NEXT: paddb 32(%rdx), %xmm0 6937; SSE-NEXT: movdqa %xmm0, 32(%rcx) 6938; SSE-NEXT: movdqa %xmm3, 48(%rcx) 6939; SSE-NEXT: movdqa %xmm2, (%rcx) 6940; SSE-NEXT: movdqa %xmm1, 16(%rcx) 6941; SSE-NEXT: retq 6942; 6943; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: 6944; AVX: # %bb.0: 6945; AVX-NEXT: vmovdqa (%rdi), %xmm0 6946; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 6947; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6948; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 6949; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 6950; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 6951; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 6952; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 6953; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 6954; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 6955; AVX-NEXT: vmovdqa %xmm0, (%rcx) 6956; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 6957; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 6958; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) 6959; AVX-NEXT: vzeroupper 6960; AVX-NEXT: retq 6961; 6962; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: 6963; AVX2: # %bb.0: 6964; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 6965; AVX2-NEXT: vmovdqa (%rdi), %xmm1 6966; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 6967; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 6968; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 6969; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 6970; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 6971; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 6972; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 6973; AVX2-NEXT: vzeroupper 6974; AVX2-NEXT: retq 6975; 6976; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: 6977; AVX512F: # %bb.0: 
6978; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 6979; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6980; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 6981; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] 6982; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 6983; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6984; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 6985; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 6986; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 6987; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 6988; AVX512F-NEXT: vzeroupper 6989; AVX512F-NEXT: retq 6990; 6991; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: 6992; AVX512DQ: # %bb.0: 6993; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 6994; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 6995; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 6996; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] 6997; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 6998; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 6999; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 7000; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 7001; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 7002; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 7003; AVX512DQ-NEXT: vzeroupper 7004; AVX512DQ-NEXT: retq 7005; 7006; AVX512BW-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: 7007; AVX512BW: # %bb.0: 7008; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 7009; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 7010; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 7011; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] 7012; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 7013; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 7014; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 7015; AVX512BW-NEXT: vzeroupper 7016; AVX512BW-NEXT: retq 7017 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 7018 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 7019 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 7020 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 7021 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15> 7022 %out.bytevec = bitcast <8 x i64> %broadcast.of.zextinreg to <64 x i8> 7023 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 7024 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 7025 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 7026 ret void 7027} 7028 7029define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 7030; SSE-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: 7031; SSE: # %bb.0: 7032; SSE-NEXT: movdqa (%rdi), %xmm0 7033; SSE-NEXT: paddb (%rsi), %xmm0 7034; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 7035; SSE-NEXT: movaps 16(%rdx), %xmm1 7036; SSE-NEXT: movaps 48(%rdx), %xmm2 7037; SSE-NEXT: movdqa (%rdx), %xmm3 7038; SSE-NEXT: paddb %xmm0, %xmm3 7039; SSE-NEXT: paddb 32(%rdx), %xmm0 7040; SSE-NEXT: movaps %xmm2, 48(%rcx) 7041; SSE-NEXT: movaps %xmm1, 16(%rcx) 7042; SSE-NEXT: movdqa %xmm0, 32(%rcx) 7043; SSE-NEXT: movdqa %xmm3, (%rcx) 7044; SSE-NEXT: retq 7045; 7046; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: 7047; AVX: # %bb.0: 7048; AVX-NEXT: vmovdqa (%rdi), %xmm0 7049; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 7050; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 7051; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 7052; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 7053; AVX-NEXT: vmovaps 
16(%rdx), %xmm2 7054; AVX-NEXT: vmovaps 48(%rdx), %xmm3 7055; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 7056; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 7057; AVX-NEXT: vmovdqa %xmm0, (%rcx) 7058; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 7059; AVX-NEXT: retq 7060; 7061; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: 7062; AVX2: # %bb.0: 7063; AVX2-NEXT: vmovdqa (%rdi), %ymm0 7064; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7065; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 7066; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 7067; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 7068; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 7069; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 7070; AVX2-NEXT: vzeroupper 7071; AVX2-NEXT: retq 7072; 7073; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: 7074; AVX512F: # %bb.0: 7075; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 7076; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7077; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] 7078; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 7079; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 7080; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 7081; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 7082; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 7083; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 7084; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 7085; AVX512F-NEXT: vzeroupper 7086; AVX512F-NEXT: retq 7087; 7088; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: 7089; AVX512DQ: # %bb.0: 7090; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 7091; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7092; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] 7093; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 7094; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 7095; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 7096; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 7097; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 7098; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 7099; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 7100; AVX512DQ-NEXT: vzeroupper 7101; AVX512DQ-NEXT: retq 7102; 7103; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: 7104; AVX512BW: # %bb.0: 7105; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 7106; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 7107; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 7108; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 7109; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 7110; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 7111; AVX512BW-NEXT: vzeroupper 7112; AVX512BW-NEXT: retq 7113 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 7114 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 7115 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 7116 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> 7117 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15> 7118 %out.bytevec = bitcast <8 x i64> %broadcast.of.zextinreg to <64 x i8> 7119 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 7120 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias 7121 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 7122 ret void 7123} 7124 7125define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { 7126; SSE-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 7127; SSE: # %bb.0: 7128; SSE-NEXT: movdqa (%rdi), %xmm0 7129; SSE-NEXT: paddb (%rsi), 
%xmm0 7130; SSE-NEXT: movaps 16(%rdx), %xmm1 7131; SSE-NEXT: movaps 48(%rdx), %xmm2 7132; SSE-NEXT: movdqa (%rdx), %xmm3 7133; SSE-NEXT: paddb %xmm0, %xmm3 7134; SSE-NEXT: paddb 32(%rdx), %xmm0 7135; SSE-NEXT: movaps %xmm2, 48(%rcx) 7136; SSE-NEXT: movaps %xmm1, 16(%rcx) 7137; SSE-NEXT: movdqa %xmm0, 32(%rcx) 7138; SSE-NEXT: movdqa %xmm3, (%rcx) 7139; SSE-NEXT: retq 7140; 7141; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 7142; AVX: # %bb.0: 7143; AVX-NEXT: vmovdqa (%rdi), %xmm0 7144; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 7145; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 7146; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 7147; AVX-NEXT: vmovaps 16(%rdx), %xmm2 7148; AVX-NEXT: vmovaps 48(%rdx), %xmm3 7149; AVX-NEXT: vmovaps %xmm2, 16(%rcx) 7150; AVX-NEXT: vmovaps %xmm3, 48(%rcx) 7151; AVX-NEXT: vmovdqa %xmm0, (%rcx) 7152; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) 7153; AVX-NEXT: retq 7154; 7155; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 7156; AVX2: # %bb.0: 7157; AVX2-NEXT: vmovdqa (%rdi), %xmm0 7158; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 7159; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 7160; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 7161; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 7162; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 7163; AVX2-NEXT: vzeroupper 7164; AVX2-NEXT: retq 7165; 7166; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 7167; AVX512F: # %bb.0: 7168; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 7169; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7170; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 7171; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] 7172; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 7173; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 7174; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 7175; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 7176; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) 7177; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) 7178; AVX512F-NEXT: vzeroupper 7179; AVX512F-NEXT: retq 7180; 7181; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 7182; AVX512DQ: # %bb.0: 7183; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 7184; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 7185; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 7186; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] 7187; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 7188; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 7189; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 7190; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 7191; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) 7192; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) 7193; AVX512DQ-NEXT: vzeroupper 7194; AVX512DQ-NEXT: retq 7195; 7196; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: 7197; AVX512BW: # %bb.0: 7198; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 7199; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 7200; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 7201; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 7202; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 7203; AVX512BW-NEXT: vzeroupper 7204; AVX512BW-NEXT: retq 7205 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 7206 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 7207 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias 7208 %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128> 7209 %broadcast.of.zextinreg = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7> 7210 %out.bytevec = bitcast <4 x i128> %broadcast.of.zextinreg to <64 x i8> 7211 %out.vec.bias = 
load <64 x i8>, ptr %out.vec.bias.ptr, align 64
 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
 ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
; FALLBACK11: {{.*}}
; FALLBACK12: {{.*}}
; FALLBACK13: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; FALLBACK8: {{.*}}
; FALLBACK9: {{.*}}