; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by the LoopVectorizer for interleaved stores.
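; As a rough illustration only (not part of the generated checks), a scalar loop
; of the following shape is the kind of source that the LoopVectorizer turns into
; the wide shufflevector plus single interleaved store exercised below; the
; function and parameter names here are hypothetical:
;
;   void store_i16_stride5(short *out, const short *s0, const short *s1,
;                          const short *s2, const short *s3, const short *s4,
;                          int n) {
;     for (int i = 0; i < n; i++) {
;       /* five input streams written to one output at stride 5 */
;       out[5 * i + 0] = s0[i];
;       out[5 * i + 1] = s1[i];
;       out[5 * i + 2] = s2[i];
;       out[5 * i + 3] = s3[i];
;       out[5 * i + 4] = s4[i];
;     }
;   }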

define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride5_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa (%rdx), %xmm1
; SSE-NEXT: movdqa (%r8), %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,7,5]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
; SSE-NEXT: pandn %xmm4, %xmm3
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: psrld $16, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: movd %xmm0, 16(%r9)
; SSE-NEXT: movdqa %xmm3, (%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride5_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rdx), %xmm1
; AVX-NEXT: vmovdqa (%r8), %xmm2
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX-NEXT: vmovd %xmm1, 16(%r9)
; AVX-NEXT: vmovdqa %xmm0, (%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride5_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,65535,65535,65535,0,65535,0,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovd %xmm1, 16(%r9)
; AVX2-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride5_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31]
; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovd %xmm1, 16(%r9)
; AVX2-FP-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride5_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31]
; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovd %xmm1, 16(%r9)
; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride5_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovd %xmm1, 16(%r9)
; AVX512-NEXT: vmovdqa %xmm0, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride5_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vmovd %xmm1, 16(%r9)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride5_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vmovd %xmm1, 16(%r9)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride5_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vmovd %xmm1, 16(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride5_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovd %xmm1, 16(%r9)
; AVX512BW-NEXT: vmovdqa %xmm0, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride5_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-FCP-NEXT: vmovd %xmm1, 16(%r9)
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride5_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-NEXT: vmovd %xmm1, 16(%r9)
; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride5_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, 16(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i16>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i16>, ptr %in.vecptr4, align 64
  %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <2 x i16> %in.vec4, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %interleaved.vec = shufflevector <10 x i16> %5, <10 x i16> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
  store <10 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride5_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3]
; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
; SSE-NEXT: pand %xmm7, %xmm4
; SSE-NEXT: pandn %xmm6, %xmm7
; SSE-NEXT: por %xmm4, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE-NEXT: pand %xmm4, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1]
; SSE-NEXT: pandn %xmm6, %xmm4
; SSE-NEXT: por %xmm7, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,0,65535,65535,65535,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1]
; SSE-NEXT: pand %xmm7, %xmm2
; SSE-NEXT: pandn %xmm3, %xmm7
; SSE-NEXT: por %xmm2, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: pand %xmm2, %xmm7
; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
; SSE-NEXT: movq %xmm3, 32(%r9)
; SSE-NEXT: movdqa %xmm2, (%r9)
; SSE-NEXT: movdqa %xmm4, 16(%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride5_vf4:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,11,10,11,u,u,u,u,4,5,12,13,u,u,u,u]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,1,2,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3]
; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa %xmm0, (%r9)
; AVX-NEXT: vmovdqa %xmm4, 16(%r9)
; AVX-NEXT: vmovq %xmm3, 32(%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride5_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride5_vf4:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FP-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride5_vf4:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride5_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: movq (%r8), %rax
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX512-NEXT: vpbroadcastq %rax, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512-NEXT: vmovq %xmm0, 32(%r9)
; AVX512-NEXT: vmovdqa %ymm1, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride5_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq (%r8), %rax
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX512-FCP-NEXT: vpbroadcastq %rax, %ymm3
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9)
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride5_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq (%r8), %rax
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpbroadcastq %rax, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride5_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq (%r8), %rax
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX512DQ-FCP-NEXT: vpbroadcastq %rax, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, 32(%r9)
; AVX512BW-NEXT: vmovdqa %ymm0, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512BW-FCP-NEXT: vmovq %xmm1, 32(%r9)
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512DQ-BW-NEXT: vmovq %xmm1, 32(%r9)
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 32(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64
  %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 64
  %in.vec3 = load <4 x i16>, ptr %in.vecptr3, align 64
  %in.vec4 = load <4 x i16>, ptr %in.vecptr4, align 64
  %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = shufflevector <4 x i16> %in.vec4, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  %interleaved.vec = shufflevector <20 x i16> %5, <20 x i16> poison, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19>
  store <20 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride5_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm5
; SSE-NEXT: movdqa (%rsi), %xmm7
; SSE-NEXT: movdqa (%rdx), %xmm2
; SSE-NEXT: movdqa (%rcx), %xmm3
; SSE-NEXT: movdqa (%r8), %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pandn %xmm5, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
; SSE-NEXT: pand %xmm0, %xmm4
; SSE-NEXT: por %xmm1, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0]
; SSE-NEXT: pand %xmm1, %xmm9
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm1, %xmm11
; SSE-NEXT: pandn %xmm10, %xmm11
; SSE-NEXT: por %xmm9, %xmm11
; SSE-NEXT: pand %xmm8, %xmm11
; SSE-NEXT: pandn %xmm4, %xmm8
; SSE-NEXT: por %xmm11, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: pand %xmm4, %xmm8
; SSE-NEXT: pandn %xmm6, %xmm4
; SSE-NEXT: por %xmm8, %xmm4
; SSE-NEXT: movdqa %xmm2, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1]
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0]
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1]
; SSE-NEXT: pand %xmm10, %xmm11
; SSE-NEXT: pandn %xmm8, %xmm10
; SSE-NEXT: por %xmm11, %xmm10
; SSE-NEXT: pand %xmm0, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
; SSE-NEXT: pandn %xmm8, %xmm0
; SSE-NEXT: por %xmm10, %xmm0
; SSE-NEXT: movdqa %xmm5, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; SSE-NEXT: psrlq $48, %xmm7
; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm7, %xmm11
; SSE-NEXT: pandn %xmm10, %xmm11
; SSE-NEXT: movdqa %xmm2, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,3]
; SSE-NEXT: pand %xmm7, %xmm12
; SSE-NEXT: por %xmm11, %xmm12
; SSE-NEXT: pand %xmm1, %xmm12
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm12, %xmm1
; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
; SSE-NEXT: pand %xmm7, %xmm5
; SSE-NEXT: pandn %xmm10, %xmm7
; SSE-NEXT: por %xmm5, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE-NEXT: pand %xmm5, %xmm7
; SSE-NEXT: pandn %xmm6, %xmm5
; SSE-NEXT: por %xmm7, %xmm5
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pandn %xmm8, %xmm3
; SSE-NEXT: por %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm3, 16(%r9)
; SSE-NEXT: movdqa %xmm5, 48(%r9)
; SSE-NEXT: movdqa %xmm1, 64(%r9)
; SSE-NEXT: movdqa %xmm0, (%r9)
; SSE-NEXT: movdqa %xmm4, 32(%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride5_vf8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa (%rsi), %xmm5
; AVX-NEXT: vmovdqa (%rdx), %xmm3
; AVX-NEXT: vmovdqa (%rcx), %xmm4
; AVX-NEXT: vmovdqa (%r8), %xmm0
; AVX-NEXT: vpsrlq $48, %xmm5, %xmm1
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5,6],xmm7[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1,2,3,4],xmm9[5],xmm6[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa %xmm0, 32(%r9)
; AVX-NEXT: vmovdqa %xmm6, 48(%r9)
; AVX-NEXT: vmovdqa %xmm8, 16(%r9)
; AVX-NEXT: vmovdqa %xmm7, (%r9)
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm9[2],xmm1[3,4,5,6],xmm9[7]
; AVX-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride5_vf8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vmovdqa (%rcx), %xmm3
; AVX2-NEXT: vmovdqa (%r8), %xmm4
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm5
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6
; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,10,11,u,u,6,7,u,u,8,9,12,13,u,u,22,23,26,27,u,u,22,23,u,u,24,25,28,29,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,6,7,10,11,u,u,u,u,u,u,8,9,u,u,u,u,22,23,26,27,u,u,u,u,u,u,24,25]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535]
; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535]
; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23]
; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpbroadcastq (%r8), %ymm6
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vpsrlq $48, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpbroadcastd 12(%r8), %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7]
; AVX2-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-NEXT: vmovdqa %ymm4, 32(%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride5_vf8:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm4
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm5
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13,22,23,26,27,26,27,22,23,24,25,24,25,28,29,28,29]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm8
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm7
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3],ymm8[4],ymm7[5,6],ymm8[7],ymm7[8,9],ymm8[10],ymm7[11],ymm8[12],ymm7[13,14],ymm8[15]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,6,7,6,7,10,11,10,11,10,11,8,9,8,9,22,23,22,23,22,23,26,27,26,27,26,27,24,25,24,25]
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm6, %ymm9
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1]
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm10, %ymm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535]
; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535]
; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23]
; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm6
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FP-NEXT: vpsrlq $48, %xmm2, %xmm2
; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-FP-NEXT: vpbroadcastd 12(%r8), %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7]
; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride5_vf8:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,2,6,0,0]
; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[2,3,6,7,2,3],zero,zero,zero,zero,ymm7[8,9,12,13,16,17],zero,zero,zero,zero,ymm7[18,19,22,23,28,29],zero,zero,zero,zero
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7]
; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3],zero,zero,zero,zero,zero,zero,ymm8[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25,28,29]
; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23]
; AVX2-FCP-NEXT: vpor %ymm6, %ymm5, %ymm5
; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm6
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7]
; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride5_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa (%rsi), %xmm3
; AVX512-NEXT: vmovdqa (%rdx), %xmm0
; AVX512-NEXT: vmovdqa (%rcx), %xmm1
; AVX512-NEXT: vmovdqa (%r8), %xmm4
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,1,2,3,5,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15]
; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7],ymm8[8,9],ymm7[10,11],ymm8[12,13,14],ymm7[15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
; AVX512-NEXT: vpandn %ymm7, %ymm8, %ymm7
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero
; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6]
; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u,28,29,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15]
; AVX512-NEXT: vpand %ymm7, %ymm8, %ymm7
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,ymm5[u,u,2,3,10,11],zero,zero,zero,zero,ymm5[u,u,20,21,28,29],zero,zero,zero,zero,ymm5[u,u,22,23]
; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
; AVX512-NEXT: vporq %zmm6, %zmm5, %zmm5
; AVX512-NEXT: vpbroadcastq (%r8), %ymm6
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm3
; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX512-NEXT: vpbroadcastd 12(%r8), %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7]
; AVX512-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride5_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,0,0]
; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7]
; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
; AVX512-FCP-NEXT: vporq %zmm6, %zmm5, %zmm5
; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 977; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) 978; AVX512-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 979; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 980; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 981; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] 982; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 983; AVX512-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 984; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] 985; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 986; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 987; AVX512-FCP-NEXT: vzeroupper 988; AVX512-FCP-NEXT: retq 989; 990; AVX512DQ-LABEL: store_i16_stride5_vf8: 991; AVX512DQ: # %bb.0: 992; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 993; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 994; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 995; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 996; AVX512DQ-NEXT: vmovdqa (%r8), %xmm4 997; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 998; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 999; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] 1000; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u] 1001; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,1,2,3,5,5,6,7] 1002; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] 1003; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7],ymm8[8,9],ymm7[10,11],ymm8[12,13,14],ymm7[15] 1004; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 1005; AVX512DQ-NEXT: vpandn %ymm7, %ymm8, %ymm7 1006; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] 1007; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero 1008; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 1009; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] 1010; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6] 1011; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] 1012; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u,28,29,u,u] 1013; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15] 1014; AVX512DQ-NEXT: vpand %ymm7, %ymm8, %ymm7 1015; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] 1016; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,ymm5[u,u,2,3,10,11],zero,zero,zero,zero,ymm5[u,u,20,21,28,29],zero,zero,zero,zero,ymm5[u,u,22,23] 1017; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 1018; AVX512DQ-NEXT: vporq %zmm6, %zmm5, %zmm5 1019; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 1020; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] 1021; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 1022; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) 1023; AVX512DQ-NEXT: vpsrlq $48, %xmm3, %xmm3 1024; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 1025; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1026; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] 1027; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 1028; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] 1029; AVX512DQ-NEXT: vpbroadcastd 12(%r8), %xmm1 1030; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] 1031; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) 1032; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) 1033; AVX512DQ-NEXT: vzeroupper 1034; AVX512DQ-NEXT: retq 1035; 1036; AVX512DQ-FCP-LABEL: store_i16_stride5_vf8: 1037; AVX512DQ-FCP: # %bb.0: 1038; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 1039; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 1040; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 1041; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 1042; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm4 1043; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 1044; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 1045; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] 1046; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero 1047; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,0,0] 1048; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 1049; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero 1050; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 1051; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] 1052; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] 1053; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] 1054; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 1055; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29] 1056; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 1057; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm5, %zmm5 1058; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 1059; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] 1060; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 1061; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) 1062; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 1063; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1064; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1065; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] 1066; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 1067; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 1068; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] 1069; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 1070; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 1071; AVX512DQ-FCP-NEXT: vzeroupper 1072; AVX512DQ-FCP-NEXT: retq 1073; 1074; AVX512BW-LABEL: store_i16_stride5_vf8: 1075; AVX512BW: # %bb.0: 1076; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1077; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 1078; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 1079; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1080; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1081; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1082; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] 1083; 
AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 1084; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] 1085; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 1086; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) 1087; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) 1088; AVX512BW-NEXT: vzeroupper 1089; AVX512BW-NEXT: retq 1090; 1091; AVX512BW-FCP-LABEL: store_i16_stride5_vf8: 1092; AVX512BW-FCP: # %bb.0: 1093; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1094; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 1095; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 1096; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1097; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1098; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1099; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] 1100; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 1101; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] 1102; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 1103; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 1104; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) 1105; AVX512BW-FCP-NEXT: vzeroupper 1106; AVX512BW-FCP-NEXT: retq 1107; 1108; AVX512DQ-BW-LABEL: store_i16_stride5_vf8: 1109; AVX512DQ-BW: # %bb.0: 1110; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 1111; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 1112; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 1113; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1114; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1115; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1116; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] 1117; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 1118; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] 1119; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 1120; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) 1121; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) 1122; AVX512DQ-BW-NEXT: vzeroupper 1123; AVX512DQ-BW-NEXT: retq 1124; 1125; AVX512DQ-BW-FCP-LABEL: store_i16_stride5_vf8: 1126; AVX512DQ-BW-FCP: # %bb.0: 1127; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1128; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 1129; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 1130; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1131; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1132; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1133; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] 1134; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 1135; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] 1136; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 1137; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 1138; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) 1139; AVX512DQ-BW-FCP-NEXT: vzeroupper 1140; AVX512DQ-BW-FCP-NEXT: retq 1141 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64 1142 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64 1143 %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 64 1144 %in.vec3 = load <8 x i16>, ptr %in.vecptr3, align 64 1145 %in.vec4 = load <8 x i16>, ptr %in.vecptr4, align 64 1146 %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 
12, i32 13, i32 14, i32 15> 1147 %2 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1148 %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1149 %4 = shufflevector <8 x i16> %in.vec4, <8 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1150 %5 = shufflevector <32 x i16> %3, <32 x i16> %4, <40 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39> 1151 %interleaved.vec = shufflevector <40 x i16> %5, <40 x i16> poison, <40 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 1, i32 9, i32 17, i32 25, i32 33, i32 2, i32 10, i32 18, i32 26, i32 34, i32 3, i32 11, i32 19, i32 27, i32 35, i32 4, i32 12, i32 20, i32 28, i32 36, i32 5, i32 13, i32 21, i32 29, i32 37, i32 6, i32 14, i32 22, i32 30, i32 38, i32 7, i32 15, i32 23, i32 31, i32 39> 1152 store <40 x i16> %interleaved.vec, ptr %out.vec, align 64 1153 ret void 1154} 1155 1156define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { 1157; SSE-LABEL: store_i16_stride5_vf16: 1158; SSE: # %bb.0: 1159; SSE-NEXT: movdqa (%rdi), %xmm15 1160; SSE-NEXT: movdqa 16(%rdi), %xmm5 1161; SSE-NEXT: movdqa (%rsi), %xmm8 1162; SSE-NEXT: movdqa 16(%rsi), %xmm0 1163; SSE-NEXT: movdqa 16(%rdx), %xmm10 1164; SSE-NEXT: movdqa (%rcx), %xmm14 1165; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1166; SSE-NEXT: movdqa 16(%rcx), %xmm11 1167; SSE-NEXT: movdqa 16(%r8), %xmm3 1168; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] 1169; SSE-NEXT: movdqa %xmm1, %xmm2 1170; SSE-NEXT: pandn %xmm5, %xmm2 1171; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] 1172; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 1173; SSE-NEXT: pand %xmm1, %xmm4 1174; SSE-NEXT: por %xmm2, %xmm4 1175; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,0,0,65535,65535] 1176; SSE-NEXT: movdqa %xmm12, %xmm6 1177; SSE-NEXT: pandn %xmm4, %xmm6 1178; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] 1179; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] 1180; SSE-NEXT: pand %xmm9, %xmm4 1181; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] 1182; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 1183; SSE-NEXT: movdqa %xmm9, %xmm13 1184; SSE-NEXT: pandn %xmm7, %xmm13 1185; SSE-NEXT: por %xmm4, %xmm13 1186; SSE-NEXT: pand %xmm12, %xmm13 1187; SSE-NEXT: por %xmm6, %xmm13 1188; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] 1189; SSE-NEXT: pand %xmm2, %xmm13 1190; SSE-NEXT: movdqa %xmm2, %xmm1 1191; SSE-NEXT: pandn %xmm3, %xmm1 1192; SSE-NEXT: 
por %xmm13, %xmm1 1193; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1194; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] 1195; SSE-NEXT: movdqa %xmm1, %xmm7 1196; SSE-NEXT: pandn %xmm15, %xmm7 1197; SSE-NEXT: movdqa %xmm15, %xmm6 1198; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] 1199; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] 1200; SSE-NEXT: pand %xmm1, %xmm13 1201; SSE-NEXT: por %xmm7, %xmm13 1202; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] 1203; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 1204; SSE-NEXT: movdqa %xmm9, %xmm14 1205; SSE-NEXT: pandn %xmm7, %xmm14 1206; SSE-NEXT: movdqa (%rdx), %xmm4 1207; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] 1208; SSE-NEXT: pand %xmm9, %xmm15 1209; SSE-NEXT: por %xmm15, %xmm14 1210; SSE-NEXT: pand %xmm12, %xmm14 1211; SSE-NEXT: pandn %xmm13, %xmm12 1212; SSE-NEXT: movdqa (%r8), %xmm1 1213; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1214; SSE-NEXT: por %xmm14, %xmm12 1215; SSE-NEXT: pand %xmm2, %xmm12 1216; SSE-NEXT: pandn %xmm1, %xmm2 1217; SSE-NEXT: por %xmm12, %xmm2 1218; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1219; SSE-NEXT: movdqa %xmm5, %xmm12 1220; SSE-NEXT: movdqa %xmm5, %xmm13 1221; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] 1222; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 1223; SSE-NEXT: psrlq $48, %xmm0 1224; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] 1225; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] 1226; SSE-NEXT: movdqa %xmm1, %xmm14 1227; SSE-NEXT: pandn %xmm12, %xmm14 1228; SSE-NEXT: movdqa %xmm10, %xmm15 1229; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] 1230; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5,7,6] 1231; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,3,3] 1232; SSE-NEXT: pand %xmm1, %xmm0 1233; SSE-NEXT: por %xmm14, %xmm0 1234; SSE-NEXT: pand %xmm9, %xmm0 1235; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1236; SSE-NEXT: movdqa %xmm9, %xmm7 1237; SSE-NEXT: pandn %xmm2, %xmm7 1238; SSE-NEXT: por %xmm0, %xmm7 1239; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1240; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 1241; SSE-NEXT: movdqa %xmm1, %xmm0 1242; SSE-NEXT: pandn %xmm15, %xmm0 1243; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] 1244; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] 1245; SSE-NEXT: pand %xmm1, %xmm13 1246; SSE-NEXT: por %xmm0, %xmm13 1247; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] 1248; SSE-NEXT: movdqa %xmm15, %xmm14 1249; SSE-NEXT: pandn %xmm2, %xmm14 1250; SSE-NEXT: pand %xmm15, %xmm13 1251; SSE-NEXT: por %xmm13, %xmm14 1252; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,6,6] 1253; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1254; SSE-NEXT: movdqa %xmm11, %xmm2 1255; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] 1256; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] 1257; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 1258; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1259; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1260; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] 1261; 
SSE-NEXT: pand %xmm13, %xmm2 1262; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] 1263; SSE-NEXT: movdqa %xmm13, %xmm12 1264; SSE-NEXT: pandn %xmm0, %xmm12 1265; SSE-NEXT: por %xmm2, %xmm12 1266; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] 1267; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,2,2,4,5,6,7] 1268; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 1269; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] 1270; SSE-NEXT: movdqa %xmm10, %xmm11 1271; SSE-NEXT: pandn %xmm2, %xmm11 1272; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,3,2,4,5,6,7] 1273; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] 1274; SSE-NEXT: pand %xmm10, %xmm2 1275; SSE-NEXT: por %xmm11, %xmm2 1276; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] 1277; SSE-NEXT: movdqa %xmm3, %xmm7 1278; SSE-NEXT: pandn %xmm0, %xmm7 1279; SSE-NEXT: pand %xmm3, %xmm2 1280; SSE-NEXT: por %xmm2, %xmm7 1281; SSE-NEXT: movdqa %xmm6, %xmm11 1282; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] 1283; SSE-NEXT: movdqa %xmm6, %xmm0 1284; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] 1285; SSE-NEXT: psrlq $48, %xmm8 1286; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] 1287; SSE-NEXT: movdqa %xmm1, %xmm2 1288; SSE-NEXT: pandn %xmm0, %xmm2 1289; SSE-NEXT: movdqa %xmm4, %xmm0 1290; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 1291; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1292; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,7,6] 1293; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] 1294; SSE-NEXT: pand %xmm1, %xmm8 1295; SSE-NEXT: por %xmm2, %xmm8 1296; SSE-NEXT: pand %xmm9, %xmm8 1297; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1298; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] 1299; SSE-NEXT: pandn %xmm2, %xmm9 1300; SSE-NEXT: por %xmm8, %xmm9 1301; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 1302; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,2,3,3,4,5,6,7] 1303; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] 1304; SSE-NEXT: pand %xmm1, %xmm8 1305; SSE-NEXT: pandn %xmm0, %xmm1 1306; SSE-NEXT: por %xmm8, %xmm1 1307; SSE-NEXT: pand %xmm15, %xmm1 1308; SSE-NEXT: pandn %xmm2, %xmm15 1309; SSE-NEXT: por %xmm1, %xmm15 1310; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] 1311; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1312; SSE-NEXT: movdqa %xmm3, %xmm1 1313; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 1314; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] 1315; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] 1316; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1317; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1318; SSE-NEXT: pand %xmm13, %xmm1 1319; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] 1320; SSE-NEXT: pandn %xmm0, %xmm13 1321; SSE-NEXT: por %xmm1, %xmm13 1322; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1323; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] 1324; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 1325; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7] 1326; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] 1327; SSE-NEXT: pand %xmm10, %xmm2 1328; SSE-NEXT: pandn %xmm1, %xmm10 1329; SSE-NEXT: 
por %xmm2, %xmm10 1330; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] 1331; SSE-NEXT: pand %xmm1, %xmm10 1332; SSE-NEXT: pandn %xmm0, %xmm1 1333; SSE-NEXT: por %xmm10, %xmm1 1334; SSE-NEXT: movdqa %xmm1, (%r9) 1335; SSE-NEXT: movdqa %xmm13, 16(%r9) 1336; SSE-NEXT: movdqa %xmm15, 48(%r9) 1337; SSE-NEXT: movdqa %xmm9, 64(%r9) 1338; SSE-NEXT: movdqa %xmm7, 80(%r9) 1339; SSE-NEXT: movdqa %xmm12, 96(%r9) 1340; SSE-NEXT: movdqa %xmm14, 128(%r9) 1341; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1342; SSE-NEXT: movaps %xmm0, 144(%r9) 1343; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1344; SSE-NEXT: movaps %xmm0, 32(%r9) 1345; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1346; SSE-NEXT: movaps %xmm0, 112(%r9) 1347; SSE-NEXT: retq 1348; 1349; AVX-LABEL: store_i16_stride5_vf16: 1350; AVX: # %bb.0: 1351; AVX-NEXT: vmovdqa (%rcx), %xmm2 1352; AVX-NEXT: vmovdqa 16(%rcx), %xmm9 1353; AVX-NEXT: vmovdqa (%rdx), %xmm5 1354; AVX-NEXT: vmovdqa 16(%rdx), %xmm10 1355; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 1356; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 1357; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] 1358; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 1359; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1360; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 1361; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0 1362; AVX-NEXT: vmovdqa (%rdi), %xmm3 1363; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 1364; AVX-NEXT: vmovdqa (%rsi), %xmm4 1365; AVX-NEXT: vmovdqa 16(%rsi), %xmm12 1366; AVX-NEXT: vpsrlq $48, %xmm12, %xmm1 1367; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] 1368; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 1369; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] 1370; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] 1371; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 1372; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1 1373; AVX-NEXT: vorps %ymm0, %ymm1, %ymm1 1374; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 1375; AVX-NEXT: vmovdqa 16(%r8), %xmm11 1376; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,3,2,3] 1377; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] 1378; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1379; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2,3,4],xmm7[5],xmm1[6,7] 1380; AVX-NEXT: vpsrlq $48, %xmm4, %xmm7 1381; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] 1382; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] 1383; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] 1384; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] 1385; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 1386; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 1387; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,7,6] 1388; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] 1389; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 1390; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 1391; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 1392; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 1393; AVX-NEXT: vandnps %ymm14, %ymm6, %ymm14 1394; AVX-NEXT: vandps %ymm6, 
%ymm0, %ymm0 1395; AVX-NEXT: vorps %ymm0, %ymm14, %ymm6 1396; AVX-NEXT: vextractf128 $1, %ymm6, %xmm0 1397; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] 1398; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] 1399; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5,6,7] 1400; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] 1401; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] 1402; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm12 1403; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] 1404; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] 1405; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] 1406; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm13[2],xmm8[3,4,5,6],xmm13[7] 1407; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] 1408; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] 1409; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 1410; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 1411; AVX-NEXT: vandnps %ymm12, %ymm15, %ymm9 1412; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 1413; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] 1414; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 1415; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 1416; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0 1417; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] 1418; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1419; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] 1420; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 1421; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 1422; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 1423; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 1424; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] 1425; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] 1426; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 1427; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 1428; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] 1429; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] 1430; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] 1431; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] 1432; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm12 1433; AVX-NEXT: vmovdqa (%r8), %xmm11 1434; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 1435; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12 1436; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 1437; AVX-NEXT: vextractf128 $1, %ymm0, %xmm12 1438; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,0,1] 1439; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] 1440; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] 1441; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] 1442; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] 1443; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] 1444; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] 1445; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 1446; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 1447; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] 1448; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 1449; AVX-NEXT: vpblendw {{.*#+}} xmm5 
= xmm5[0,1,2,3],xmm3[4],xmm5[5,6,7] 1450; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1451; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] 1452; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] 1453; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 1454; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 1455; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 1456; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 1457; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 1458; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] 1459; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm3[2],xmm6[3,4,5,6],xmm3[7] 1460; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm11[3],xmm2[4,5,6,7] 1461; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 1462; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] 1463; AVX-NEXT: vmovdqa %xmm2, 48(%r9) 1464; AVX-NEXT: vmovdqa %xmm5, 32(%r9) 1465; AVX-NEXT: vmovdqa %xmm0, (%r9) 1466; AVX-NEXT: vmovdqa %xmm12, 16(%r9) 1467; AVX-NEXT: vmovdqa %xmm10, 112(%r9) 1468; AVX-NEXT: vmovdqa %xmm9, 96(%r9) 1469; AVX-NEXT: vmovdqa %xmm4, 64(%r9) 1470; AVX-NEXT: vmovdqa %xmm8, 80(%r9) 1471; AVX-NEXT: vmovdqa %xmm1, 128(%r9) 1472; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1473; AVX-NEXT: vmovaps %xmm0, 144(%r9) 1474; AVX-NEXT: vzeroupper 1475; AVX-NEXT: retq 1476; 1477; AVX2-LABEL: store_i16_stride5_vf16: 1478; AVX2: # %bb.0: 1479; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1480; AVX2-NEXT: vmovdqa (%rsi), %ymm2 1481; AVX2-NEXT: vmovdqa (%rdx), %ymm3 1482; AVX2-NEXT: vmovdqa (%rcx), %ymm4 1483; AVX2-NEXT: vmovdqa (%r8), %ymm1 1484; AVX2-NEXT: vmovdqa (%rdx), %xmm7 1485; AVX2-NEXT: vmovdqa (%rcx), %xmm8 1486; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1487; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1488; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1489; AVX2-NEXT: vmovdqa (%rsi), %xmm9 1490; AVX2-NEXT: vmovdqa (%rdi), %xmm6 1491; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] 1492; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] 1493; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] 1494; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1495; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 1496; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm5, %ymm5 1497; AVX2-NEXT: vpbroadcastq (%r8), %ymm6 1498; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 1499; AVX2-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 1500; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,2,5,5,5,6] 1501; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 1502; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15] 1503; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 1504; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,2,1,4,5,6,5] 1505; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 1506; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 1507; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] 1508; 
AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 1509; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 1510; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 1511; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm10 1512; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 1513; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 1514; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm10 1515; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1516; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] 1517; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 1518; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] 1519; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1520; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] 1521; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] 1522; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 1523; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 1524; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] 1525; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 1526; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1527; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] 1528; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 1529; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6] 1530; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] 1531; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] 1532; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] 1533; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 1534; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] 1535; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] 1536; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 1537; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 1538; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 1539; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm9 1540; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 1541; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 1542; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 1543; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] 1544; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 1545; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 1546; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] 1547; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 1548; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 1549; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 1550; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] 1551; AVX2-NEXT: vpmovsxbw {{.*#+}} 
ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 1552; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1553; AVX2-NEXT: vmovdqa %ymm0, 64(%r9) 1554; AVX2-NEXT: vmovdqa %ymm8, 128(%r9) 1555; AVX2-NEXT: vmovdqa %ymm7, 32(%r9) 1556; AVX2-NEXT: vmovdqa %ymm6, 96(%r9) 1557; AVX2-NEXT: vmovdqa %ymm5, (%r9) 1558; AVX2-NEXT: vzeroupper 1559; AVX2-NEXT: retq 1560; 1561; AVX2-FP-LABEL: store_i16_stride5_vf16: 1562; AVX2-FP: # %bb.0: 1563; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1564; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 1565; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 1566; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 1567; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 1568; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 1569; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 1570; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 1571; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 1572; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1573; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm7 1574; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm8 1575; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1576; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1577; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 1578; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 1579; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 1580; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm9 1581; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 1582; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 1583; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm9 1584; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1585; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] 1586; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1587; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] 1588; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1589; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] 1590; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] 1591; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 1592; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 1593; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] 1594; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 1595; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 1596; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] 1597; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 1598; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] 1599; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 1600; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] 1601; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] 1602; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] 1603; AVX2-FP-NEXT: vpermq {{.*#+}} 
ymm8 = ymm8[2,3,2,3] 1604; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 1605; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1606; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm8 1607; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 1608; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1609; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] 1610; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] 1611; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] 1612; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] 1613; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] 1614; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] 1615; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] 1616; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 1617; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 1618; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 1619; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm9 1620; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 1621; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 1622; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 1623; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] 1624; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 1625; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 1626; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] 1627; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 1628; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 1629; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 1630; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] 1631; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 1632; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1633; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) 1634; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%r9) 1635; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%r9) 1636; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) 1637; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) 1638; AVX2-FP-NEXT: vzeroupper 1639; AVX2-FP-NEXT: retq 1640; 1641; AVX2-FCP-LABEL: store_i16_stride5_vf16: 1642; AVX2-FCP: # %bb.0: 1643; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1644; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 1645; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 1646; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 1647; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 1648; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 1649; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 1650; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 1651; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 1652; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1653; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 1654; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 1655; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1656; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1657; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 1658; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 1659; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 1660; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm9 1661; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 1662; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 1663; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm9 1664; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1665; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] 1666; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1667; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] 1668; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1669; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] 1670; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] 1671; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 1672; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 1673; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] 1674; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 1675; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 1676; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] 1677; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 1678; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] 1679; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 1680; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] 1681; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] 1682; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] 1683; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 1684; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 1685; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1686; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm8 1687; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 1688; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1689; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] 1690; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] 1691; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] 1692; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] 1693; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = 
ymm3[3,2,3,3,7,6,7,7] 1694; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] 1695; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] 1696; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 1697; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 1698; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 1699; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm9 1700; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 1701; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 1702; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 1703; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] 1704; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 1705; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 1706; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] 1707; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 1708; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 1709; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 1710; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] 1711; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 1712; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1713; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) 1714; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%r9) 1715; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%r9) 1716; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) 1717; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) 1718; AVX2-FCP-NEXT: vzeroupper 1719; AVX2-FCP-NEXT: retq 1720; 1721; AVX512-LABEL: store_i16_stride5_vf16: 1722; AVX512: # %bb.0: 1723; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1724; AVX512-NEXT: vmovdqa (%rsi), %ymm1 1725; AVX512-NEXT: vmovdqa (%rdx), %ymm2 1726; AVX512-NEXT: vmovdqa (%rcx), %ymm3 1727; AVX512-NEXT: vmovdqa (%r8), %ymm4 1728; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm5 1729; AVX512-NEXT: vmovdqa (%rsi), %xmm6 1730; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1731; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] 1732; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1733; AVX512-NEXT: vmovdqa (%rdx), %xmm7 1734; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] 1735; AVX512-NEXT: vmovdqa (%rcx), %xmm9 1736; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1737; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] 1738; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] 1739; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) 1740; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 1741; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] 1742; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1743; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 1744; AVX512-NEXT: vmovdqa (%rdi), 
%xmm8 1745; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 1746; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] 1747; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] 1748; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1749; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 1750; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 1751; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] 1752; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 1753; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] 1754; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 1755; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) 1756; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] 1757; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] 1758; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] 1759; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] 1760; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] 1761; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] 1762; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 1763; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) 1764; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] 1765; AVX512-NEXT: vprolq $16, %ymm1, %ymm10 1766; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] 1767; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 1768; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] 1769; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 1770; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 1771; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 1772; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) 1773; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 1774; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] 1775; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 1776; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 1777; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero 1778; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 1779; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) 1780; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] 1781; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 1782; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 1783; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 1784; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 1785; 
AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 1786; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 1787; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] 1788; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1789; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 1790; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2)) 1791; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 1792; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) 1793; AVX512-NEXT: vmovdqa %ymm1, 128(%r9) 1794; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r9) 1795; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) 1796; AVX512-NEXT: vzeroupper 1797; AVX512-NEXT: retq 1798; 1799; AVX512-FCP-LABEL: store_i16_stride5_vf16: 1800; AVX512-FCP: # %bb.0: 1801; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1802; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 1803; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 1804; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3 1805; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4 1806; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm5 1807; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm6 1808; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1809; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] 1810; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1811; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 1812; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] 1813; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm9 1814; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1815; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] 1816; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] 1817; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) 1818; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 1819; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] 1820; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1821; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 1822; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 1823; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 1824; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 1825; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1826; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 1827; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 1828; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] 1829; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 1830; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] 1831; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 1832; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) 1833; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] 1834; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] 1835; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] 1836; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] 1837; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] 1838; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] 1839; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 1840; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) 1841; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] 1842; AVX512-FCP-NEXT: vprolq $16, %ymm1, %ymm10 1843; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] 1844; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 1845; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] 1846; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 1847; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 1848; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 1849; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) 1850; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 1851; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] 1852; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 1853; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 1854; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero 1855; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 1856; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) 1857; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] 1858; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] 1859; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 1860; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 1861; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 1862; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] 1863; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1864; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 1865; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2)) 1866; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 1867; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) 1868; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) 1869; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) 1870; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) 1871; AVX512-FCP-NEXT: vzeroupper 1872; AVX512-FCP-NEXT: retq 1873; 1874; AVX512DQ-LABEL: store_i16_stride5_vf16: 1875; AVX512DQ: # %bb.0: 1876; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1877; AVX512DQ-NEXT: vmovdqa (%rsi), 
%ymm1 1878; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 1879; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm3 1880; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 1881; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm5 1882; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm6 1883; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1884; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] 1885; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1886; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 1887; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] 1888; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm9 1889; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1890; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] 1891; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] 1892; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) 1893; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 1894; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] 1895; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1896; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 1897; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 1898; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 1899; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] 1900; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] 1901; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1902; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 1903; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 1904; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] 1905; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 1906; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] 1907; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 1908; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) 1909; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] 1910; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] 1911; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] 1912; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] 1913; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] 1914; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] 1915; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 1916; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) 1917; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] 1918; AVX512DQ-NEXT: vprolq $16, %ymm1, %ymm10 1919; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] 1920; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 1921; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] 1922; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 1923; AVX512DQ-NEXT: 
vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 1924; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 1925; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) 1926; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 1927; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] 1928; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 1929; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 1930; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero 1931; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 1932; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) 1933; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] 1934; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 1935; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 1936; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 1937; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 1938; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 1939; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 1940; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] 1941; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1942; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 1943; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2)) 1944; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 1945; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) 1946; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%r9) 1947; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r9) 1948; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) 1949; AVX512DQ-NEXT: vzeroupper 1950; AVX512DQ-NEXT: retq 1951; 1952; AVX512DQ-FCP-LABEL: store_i16_stride5_vf16: 1953; AVX512DQ-FCP: # %bb.0: 1954; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1955; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 1956; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 1957; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3 1958; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4 1959; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm5 1960; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm6 1961; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 1962; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] 1963; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 1964; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 1965; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] 1966; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm9 1967; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 1968; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] 1969; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] 1970; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) 1971; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 1972; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] 1973; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 1974; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 1975; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 1976; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 1977; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 1978; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 1979; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 1980; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 1981; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] 1982; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 1983; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] 1984; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 1985; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) 1986; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] 1987; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] 1988; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] 1989; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] 1990; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] 1991; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] 1992; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 1993; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) 1994; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] 1995; AVX512DQ-FCP-NEXT: vprolq $16, %ymm1, %ymm10 1996; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] 1997; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 1998; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] 1999; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] 2000; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 2001; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 2002; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) 2003; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 2004; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] 2005; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 2006; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 2007; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero 2008; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 2009; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) 2010; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] 2011; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] 2012; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 2013; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 2014; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 2015; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] 2016; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 2017; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 2018; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2)) 2019; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 2020; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) 2021; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) 2022; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) 2023; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) 2024; AVX512DQ-FCP-NEXT: vzeroupper 2025; AVX512DQ-FCP-NEXT: retq 2026; 2027; AVX512BW-LABEL: store_i16_stride5_vf16: 2028; AVX512BW: # %bb.0: 2029; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 2030; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 2031; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 2032; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2033; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2034; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] 2035; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2036; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] 2037; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 2038; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 2039; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 2040; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] 2041; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 2042; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 2043; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 2044; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) 2045; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) 2046; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] 2047; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 2048; AVX512BW-NEXT: vmovdqa %ymm0, 128(%r9) 2049; AVX512BW-NEXT: vzeroupper 2050; AVX512BW-NEXT: retq 2051; 2052; AVX512BW-FCP-LABEL: store_i16_stride5_vf16: 2053; AVX512BW-FCP: # %bb.0: 2054; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 2055; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 2056; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 2057; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2058; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2059; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] 2060; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2061; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] 2062; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 2063; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 2064; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 2065; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] 2066; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 2067; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 2068; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 2069; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) 2070; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) 2071; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] 2072; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 2073; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) 2074; AVX512BW-FCP-NEXT: vzeroupper 2075; AVX512BW-FCP-NEXT: retq 2076; 2077; AVX512DQ-BW-LABEL: store_i16_stride5_vf16: 2078; AVX512DQ-BW: # %bb.0: 2079; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 2080; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 2081; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 2082; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2083; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2084; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] 2085; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2086; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] 2087; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 2088; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 2089; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 2090; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] 2091; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 2092; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 2093; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 2094; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9) 2095; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) 2096; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] 2097; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 2098; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 128(%r9) 2099; AVX512DQ-BW-NEXT: vzeroupper 2100; AVX512DQ-BW-NEXT: retq 2101; 2102; AVX512DQ-BW-FCP-LABEL: store_i16_stride5_vf16: 2103; AVX512DQ-BW-FCP: # %bb.0: 2104; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 2105; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 2106; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 2107; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 2108; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 2109; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] 2110; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 2111; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] 2112; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 2113; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = 
[0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 2114; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 2115; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] 2116; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 2117; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 2118; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 2119; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) 2120; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) 2121; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] 2122; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 2123; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) 2124; AVX512DQ-BW-FCP-NEXT: vzeroupper 2125; AVX512DQ-BW-FCP-NEXT: retq 2126 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 2127 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64 2128 %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 64 2129 %in.vec3 = load <16 x i16>, ptr %in.vecptr3, align 64 2130 %in.vec4 = load <16 x i16>, ptr %in.vecptr4, align 64 2131 %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2132 %2 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2133 %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 2134 %4 = shufflevector <16 x i16> %in.vec4, <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2135 %5 = shufflevector <64 x i16> %3, <64 x i16> %4, <80 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, 
i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79> 2136 %interleaved.vec = shufflevector <80 x i16> %5, <80 x i16> poison, <80 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 1, i32 17, i32 33, i32 49, i32 65, i32 2, i32 18, i32 34, i32 50, i32 66, i32 3, i32 19, i32 35, i32 51, i32 67, i32 4, i32 20, i32 36, i32 52, i32 68, i32 5, i32 21, i32 37, i32 53, i32 69, i32 6, i32 22, i32 38, i32 54, i32 70, i32 7, i32 23, i32 39, i32 55, i32 71, i32 8, i32 24, i32 40, i32 56, i32 72, i32 9, i32 25, i32 41, i32 57, i32 73, i32 10, i32 26, i32 42, i32 58, i32 74, i32 11, i32 27, i32 43, i32 59, i32 75, i32 12, i32 28, i32 44, i32 60, i32 76, i32 13, i32 29, i32 45, i32 61, i32 77, i32 14, i32 30, i32 46, i32 62, i32 78, i32 15, i32 31, i32 47, i32 63, i32 79> 2137 store <80 x i16> %interleaved.vec, ptr %out.vec, align 64 2138 ret void 2139} 2140 2141define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { 2142; SSE-LABEL: store_i16_stride5_vf32: 2143; SSE: # %bb.0: 2144; SSE-NEXT: subq $248, %rsp 2145; SSE-NEXT: movdqa (%rdi), %xmm5 2146; SSE-NEXT: movdqa 16(%rdi), %xmm9 2147; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2148; SSE-NEXT: movdqa (%rsi), %xmm6 2149; SSE-NEXT: movdqa 16(%rsi), %xmm12 2150; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2151; SSE-NEXT: movdqa (%rdx), %xmm2 2152; SSE-NEXT: movdqa (%rcx), %xmm8 2153; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2154; SSE-NEXT: movdqa 16(%rcx), %xmm14 2155; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2156; SSE-NEXT: movdqa (%r8), %xmm0 2157; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2158; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535] 2159; SSE-NEXT: movdqa %xmm15, %xmm1 2160; SSE-NEXT: pandn %xmm5, %xmm1 2161; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] 2162; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 2163; SSE-NEXT: pand %xmm15, %xmm3 2164; SSE-NEXT: por %xmm1, %xmm3 2165; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] 2166; SSE-NEXT: movdqa %xmm1, %xmm4 2167; SSE-NEXT: pandn %xmm3, %xmm4 2168; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] 2169; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,65535,65535,0] 2170; SSE-NEXT: pand %xmm13, %xmm7 2171; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] 2172; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 2173; SSE-NEXT: movdqa %xmm13, %xmm11 2174; SSE-NEXT: pandn %xmm8, %xmm11 2175; SSE-NEXT: por %xmm7, %xmm11 2176; SSE-NEXT: pand %xmm1, %xmm11 2177; SSE-NEXT: por %xmm4, %xmm11 2178; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] 2179; SSE-NEXT: pand %xmm10, %xmm11 2180; SSE-NEXT: movdqa %xmm10, %xmm3 2181; SSE-NEXT: pandn %xmm0, %xmm3 2182; SSE-NEXT: por %xmm11, %xmm3 2183; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2184; SSE-NEXT: movdqa %xmm15, %xmm4 2185; SSE-NEXT: pandn %xmm9, %xmm4 2186; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7] 2187; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 2188; SSE-NEXT: pand %xmm15, %xmm7 2189; SSE-NEXT: por %xmm4, %xmm7 2190; 
SSE-NEXT: movdqa %xmm1, %xmm8 2191; SSE-NEXT: pandn %xmm7, %xmm8 2192; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] 2193; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2194; SSE-NEXT: movdqa %xmm13, %xmm7 2195; SSE-NEXT: pandn %xmm4, %xmm7 2196; SSE-NEXT: movdqa 16(%rdx), %xmm14 2197; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] 2198; SSE-NEXT: pand %xmm13, %xmm11 2199; SSE-NEXT: por %xmm11, %xmm7 2200; SSE-NEXT: pand %xmm1, %xmm7 2201; SSE-NEXT: por %xmm8, %xmm7 2202; SSE-NEXT: movdqa 16(%r8), %xmm4 2203; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2204; SSE-NEXT: pand %xmm10, %xmm7 2205; SSE-NEXT: movdqa %xmm10, %xmm3 2206; SSE-NEXT: pandn %xmm4, %xmm3 2207; SSE-NEXT: por %xmm7, %xmm3 2208; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2209; SSE-NEXT: movdqa 32(%rdi), %xmm3 2210; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2211; SSE-NEXT: movdqa %xmm15, %xmm7 2212; SSE-NEXT: pandn %xmm3, %xmm7 2213; SSE-NEXT: movdqa 32(%rsi), %xmm3 2214; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2215; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] 2216; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 2217; SSE-NEXT: pand %xmm15, %xmm8 2218; SSE-NEXT: por %xmm7, %xmm8 2219; SSE-NEXT: movdqa %xmm1, %xmm7 2220; SSE-NEXT: pandn %xmm8, %xmm7 2221; SSE-NEXT: movdqa 32(%rcx), %xmm3 2222; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2223; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] 2224; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 2225; SSE-NEXT: movdqa %xmm13, %xmm12 2226; SSE-NEXT: pandn %xmm8, %xmm12 2227; SSE-NEXT: movdqa 32(%rdx), %xmm11 2228; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] 2229; SSE-NEXT: pand %xmm13, %xmm8 2230; SSE-NEXT: por %xmm8, %xmm12 2231; SSE-NEXT: pand %xmm1, %xmm12 2232; SSE-NEXT: por %xmm7, %xmm12 2233; SSE-NEXT: pand %xmm10, %xmm12 2234; SSE-NEXT: movdqa 32(%r8), %xmm7 2235; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill 2236; SSE-NEXT: movdqa %xmm10, %xmm3 2237; SSE-NEXT: pandn %xmm7, %xmm3 2238; SSE-NEXT: por %xmm12, %xmm3 2239; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2240; SSE-NEXT: movdqa 48(%rdi), %xmm3 2241; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2242; SSE-NEXT: movdqa %xmm15, %xmm7 2243; SSE-NEXT: pandn %xmm3, %xmm7 2244; SSE-NEXT: movdqa 48(%rsi), %xmm3 2245; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2246; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] 2247; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 2248; SSE-NEXT: pand %xmm15, %xmm8 2249; SSE-NEXT: por %xmm7, %xmm8 2250; SSE-NEXT: movdqa 48(%rcx), %xmm3 2251; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2252; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] 2253; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 2254; SSE-NEXT: movdqa %xmm13, %xmm12 2255; SSE-NEXT: pandn %xmm7, %xmm12 2256; SSE-NEXT: movdqa 48(%rdx), %xmm3 2257; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2258; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] 2259; SSE-NEXT: pand %xmm13, %xmm7 2260; SSE-NEXT: por %xmm7, %xmm12 2261; SSE-NEXT: pand %xmm1, %xmm12 2262; SSE-NEXT: pandn %xmm8, %xmm1 2263; SSE-NEXT: por %xmm12, %xmm1 2264; SSE-NEXT: pand %xmm10, %xmm1 2265; SSE-NEXT: movdqa 48(%r8), %xmm3 2266; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2267; SSE-NEXT: pandn %xmm3, %xmm10 2268; SSE-NEXT: por %xmm1, %xmm10 
2269; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2270; SSE-NEXT: movdqa %xmm2, %xmm1 2271; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2272; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2273; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] 2274; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,1] 2275; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] 2276; SSE-NEXT: movdqa %xmm3, %xmm8 2277; SSE-NEXT: pandn %xmm7, %xmm8 2278; SSE-NEXT: movdqa %xmm5, %xmm7 2279; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 2280; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,3,2,4,5,6,7] 2281; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] 2282; SSE-NEXT: pand %xmm3, %xmm12 2283; SSE-NEXT: por %xmm8, %xmm12 2284; SSE-NEXT: pand %xmm15, %xmm12 2285; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2286; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] 2287; SSE-NEXT: movdqa %xmm15, %xmm1 2288; SSE-NEXT: pandn %xmm8, %xmm1 2289; SSE-NEXT: por %xmm12, %xmm1 2290; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2291; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] 2292; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] 2293; SSE-NEXT: movdqa %xmm0, %xmm12 2294; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] 2295; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] 2296; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] 2297; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] 2298; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] 2299; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] 2300; SSE-NEXT: movdqa %xmm9, %xmm1 2301; SSE-NEXT: pandn %xmm8, %xmm1 2302; SSE-NEXT: pand %xmm9, %xmm12 2303; SSE-NEXT: por %xmm12, %xmm1 2304; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2305; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2306; SSE-NEXT: movdqa %xmm2, %xmm7 2307; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2308; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] 2309; SSE-NEXT: movdqa %xmm0, %xmm8 2310; SSE-NEXT: pandn %xmm7, %xmm8 2311; SSE-NEXT: movdqa %xmm5, %xmm7 2312; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2313; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] 2314; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,2] 2315; SSE-NEXT: pand %xmm0, %xmm12 2316; SSE-NEXT: por %xmm8, %xmm12 2317; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,0,65535,65535] 2318; SSE-NEXT: pand %xmm10, %xmm12 2319; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] 2320; SSE-NEXT: movdqa %xmm10, %xmm1 2321; SSE-NEXT: pandn %xmm8, %xmm1 2322; SSE-NEXT: por %xmm12, %xmm1 2323; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2324; SSE-NEXT: psrlq $48, %xmm6 2325; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] 2326; SSE-NEXT: movdqa %xmm0, %xmm6 2327; SSE-NEXT: pandn %xmm5, %xmm6 2328; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] 2329; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] 2330; SSE-NEXT: pand %xmm0, %xmm2 2331; SSE-NEXT: por %xmm6, %xmm2 2332; SSE-NEXT: movdqa %xmm13, %xmm1 2333; SSE-NEXT: pandn %xmm8, %xmm1 2334; SSE-NEXT: pand %xmm13, %xmm2 2335; SSE-NEXT: 
por %xmm2, %xmm1 2336; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2337; SSE-NEXT: movdqa %xmm14, %xmm2 2338; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2339; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] 2340; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] 2341; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 2342; SSE-NEXT: movdqa %xmm3, %xmm5 2343; SSE-NEXT: pandn %xmm2, %xmm5 2344; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2345; SSE-NEXT: movdqa %xmm1, %xmm2 2346; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2347; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] 2348; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7] 2349; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] 2350; SSE-NEXT: pand %xmm3, %xmm6 2351; SSE-NEXT: por %xmm5, %xmm6 2352; SSE-NEXT: pand %xmm15, %xmm6 2353; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2354; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] 2355; SSE-NEXT: movdqa %xmm15, %xmm12 2356; SSE-NEXT: pandn %xmm5, %xmm12 2357; SSE-NEXT: por %xmm6, %xmm12 2358; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2359; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] 2360; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2361; SSE-NEXT: movdqa %xmm8, %xmm6 2362; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] 2363; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] 2364; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] 2365; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 2366; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] 2367; SSE-NEXT: movdqa %xmm9, %xmm2 2368; SSE-NEXT: pandn %xmm5, %xmm2 2369; SSE-NEXT: pand %xmm9, %xmm6 2370; SSE-NEXT: por %xmm6, %xmm2 2371; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2372; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] 2373; SSE-NEXT: movdqa %xmm14, %xmm2 2374; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2375; SSE-NEXT: movdqa %xmm0, %xmm5 2376; SSE-NEXT: pandn %xmm2, %xmm5 2377; SSE-NEXT: movdqa %xmm1, %xmm2 2378; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] 2379; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] 2380; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] 2381; SSE-NEXT: pand %xmm0, %xmm2 2382; SSE-NEXT: por %xmm5, %xmm2 2383; SSE-NEXT: pand %xmm10, %xmm2 2384; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 2385; SSE-NEXT: movdqa %xmm10, %xmm4 2386; SSE-NEXT: pandn %xmm5, %xmm4 2387; SSE-NEXT: por %xmm2, %xmm4 2388; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2389; SSE-NEXT: movdqa %xmm7, %xmm2 2390; SSE-NEXT: psrlq $48, %xmm2 2391; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 2392; SSE-NEXT: movdqa %xmm0, %xmm2 2393; SSE-NEXT: pandn %xmm1, %xmm2 2394; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,4,5,7,6] 2395; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] 2396; SSE-NEXT: pand %xmm0, %xmm4 2397; SSE-NEXT: por %xmm2, %xmm4 2398; SSE-NEXT: movdqa %xmm13, %xmm1 2399; SSE-NEXT: pandn %xmm5, %xmm1 2400; SSE-NEXT: pand %xmm13, %xmm4 2401; SSE-NEXT: por %xmm4, %xmm1 2402; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2403; SSE-NEXT: movdqa %xmm11, %xmm2 2404; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2405; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] 2406; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] 2407; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 2408; SSE-NEXT: movdqa %xmm3, %xmm4 2409; SSE-NEXT: pandn %xmm2, %xmm4 2410; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2411; SSE-NEXT: movdqa %xmm7, %xmm2 2412; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2413; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] 2414; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] 2415; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] 2416; SSE-NEXT: pand %xmm3, %xmm5 2417; SSE-NEXT: por %xmm4, %xmm5 2418; SSE-NEXT: pand %xmm15, %xmm5 2419; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload 2420; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] 2421; SSE-NEXT: movdqa %xmm15, %xmm4 2422; SSE-NEXT: pandn %xmm1, %xmm4 2423; SSE-NEXT: por %xmm5, %xmm4 2424; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2425; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] 2426; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2427; SSE-NEXT: movdqa %xmm8, %xmm5 2428; SSE-NEXT: movdqa %xmm8, %xmm4 2429; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] 2430; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 2431; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] 2432; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 2433; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 2434; SSE-NEXT: movdqa %xmm9, %xmm8 2435; SSE-NEXT: pandn %xmm1, %xmm8 2436; SSE-NEXT: pand %xmm9, %xmm5 2437; SSE-NEXT: por %xmm5, %xmm8 2438; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] 2439; SSE-NEXT: movdqa %xmm11, %xmm1 2440; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2441; SSE-NEXT: movdqa %xmm0, %xmm5 2442; SSE-NEXT: pandn %xmm1, %xmm5 2443; SSE-NEXT: movdqa %xmm7, %xmm4 2444; SSE-NEXT: movdqa %xmm7, %xmm1 2445; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 2446; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] 2447; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] 2448; SSE-NEXT: pand %xmm0, %xmm1 2449; SSE-NEXT: por %xmm5, %xmm1 2450; SSE-NEXT: pand %xmm10, %xmm1 2451; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] 2452; SSE-NEXT: movdqa %xmm10, %xmm9 2453; SSE-NEXT: pandn %xmm2, %xmm9 2454; SSE-NEXT: por %xmm1, %xmm9 2455; SSE-NEXT: movdqa %xmm6, %xmm1 2456; SSE-NEXT: psrlq $48, %xmm1 2457; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] 2458; SSE-NEXT: movdqa %xmm0, %xmm1 2459; SSE-NEXT: pandn %xmm4, %xmm1 2460; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,6] 2461; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] 2462; SSE-NEXT: pand %xmm0, %xmm11 2463; SSE-NEXT: por %xmm1, %xmm11 2464; SSE-NEXT: movdqa %xmm13, %xmm14 2465; SSE-NEXT: pandn %xmm2, %xmm14 2466; SSE-NEXT: pand %xmm13, %xmm11 2467; SSE-NEXT: por %xmm11, %xmm14 2468; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2469; SSE-NEXT: movdqa %xmm1, %xmm2 2470; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2471; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] 2472; SSE-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm2[0,1,2,2,4,5,6,7] 2473; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 2474; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2475; SSE-NEXT: movdqa %xmm7, %xmm11 2476; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2477; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] 2478; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] 2479; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] 2480; SSE-NEXT: pand %xmm3, %xmm12 2481; SSE-NEXT: pandn %xmm2, %xmm3 2482; SSE-NEXT: por %xmm12, %xmm3 2483; SSE-NEXT: pand %xmm15, %xmm3 2484; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2485; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] 2486; SSE-NEXT: pandn %xmm2, %xmm15 2487; SSE-NEXT: por %xmm3, %xmm15 2488; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6] 2489; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 2490; SSE-NEXT: movdqa %xmm6, %xmm11 2491; SSE-NEXT: movdqa %xmm1, %xmm12 2492; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] 2493; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] 2494; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] 2495; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] 2496; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] 2497; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] 2498; SSE-NEXT: pand %xmm1, %xmm11 2499; SSE-NEXT: pandn %xmm2, %xmm1 2500; SSE-NEXT: por %xmm11, %xmm1 2501; SSE-NEXT: movdqa %xmm12, %xmm3 2502; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 2503; SSE-NEXT: movdqa %xmm3, %xmm2 2504; SSE-NEXT: movdqa %xmm3, %xmm6 2505; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2506; SSE-NEXT: movdqa %xmm0, %xmm3 2507; SSE-NEXT: pandn %xmm2, %xmm3 2508; SSE-NEXT: movdqa %xmm7, %xmm2 2509; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 2510; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] 2511; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] 2512; SSE-NEXT: pand %xmm0, %xmm2 2513; SSE-NEXT: por %xmm3, %xmm2 2514; SSE-NEXT: pand %xmm10, %xmm2 2515; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] 2516; SSE-NEXT: pandn %xmm3, %xmm10 2517; SSE-NEXT: por %xmm2, %xmm10 2518; SSE-NEXT: psrlq $48, %xmm5 2519; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm5[1] 2520; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6] 2521; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] 2522; SSE-NEXT: pand %xmm0, %xmm2 2523; SSE-NEXT: pandn %xmm7, %xmm0 2524; SSE-NEXT: por %xmm2, %xmm0 2525; SSE-NEXT: pand %xmm13, %xmm0 2526; SSE-NEXT: pandn %xmm3, %xmm13 2527; SSE-NEXT: por %xmm0, %xmm13 2528; SSE-NEXT: movdqa %xmm13, 304(%r9) 2529; SSE-NEXT: movdqa %xmm10, 288(%r9) 2530; SSE-NEXT: movdqa %xmm1, 256(%r9) 2531; SSE-NEXT: movdqa %xmm15, 240(%r9) 2532; SSE-NEXT: movdqa %xmm14, 224(%r9) 2533; SSE-NEXT: movdqa %xmm9, 208(%r9) 2534; SSE-NEXT: movdqa %xmm8, 176(%r9) 2535; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2536; SSE-NEXT: movaps %xmm0, 160(%r9) 2537; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2538; SSE-NEXT: movaps %xmm0, 144(%r9) 2539; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2540; SSE-NEXT: movaps %xmm0, 128(%r9) 2541; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2542; SSE-NEXT: movaps %xmm0, 96(%r9) 
2543; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2544; SSE-NEXT: movaps %xmm0, 80(%r9) 2545; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2546; SSE-NEXT: movaps %xmm0, 64(%r9) 2547; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2548; SSE-NEXT: movaps %xmm0, 48(%r9) 2549; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2550; SSE-NEXT: movaps %xmm0, 16(%r9) 2551; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2552; SSE-NEXT: movaps %xmm0, (%r9) 2553; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2554; SSE-NEXT: movaps %xmm0, 272(%r9) 2555; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2556; SSE-NEXT: movaps %xmm0, 192(%r9) 2557; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2558; SSE-NEXT: movaps %xmm0, 112(%r9) 2559; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2560; SSE-NEXT: movaps %xmm0, 32(%r9) 2561; SSE-NEXT: addq $248, %rsp 2562; SSE-NEXT: retq 2563; 2564; AVX-LABEL: store_i16_stride5_vf32: 2565; AVX: # %bb.0: 2566; AVX-NEXT: subq $56, %rsp 2567; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 2568; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 2569; AVX-NEXT: vmovdqa 32(%rsi), %xmm11 2570; AVX-NEXT: vmovdqa 48(%rsi), %xmm6 2571; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] 2572; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 2573; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] 2574; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] 2575; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] 2576; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2577; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2578; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 2579; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm2 2580; AVX-NEXT: vmovdqa 32(%rdx), %xmm12 2581; AVX-NEXT: vmovdqa 48(%rdx), %xmm1 2582; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2] 2583; AVX-NEXT: vmovdqa 32(%rcx), %xmm13 2584; AVX-NEXT: vmovdqa 48(%rcx), %xmm3 2585; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] 2586; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 2587; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4,5,6],xmm7[7] 2588; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 2589; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] 2590; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] 2591; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 2592; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5 2593; AVX-NEXT: vorps %ymm2, %ymm5, %ymm5 2594; AVX-NEXT: vmovdqa 48(%r8), %xmm2 2595; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 2596; AVX-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2597; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] 2598; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] 2599; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 2600; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 2601; AVX-NEXT: vandnps %ymm7, %ymm14, %ymm7 2602; AVX-NEXT: vpsrlq $48, %xmm6, %xmm8 2603; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm4[1],xmm8[1] 2604; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 2605; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm4[0,2,3,3,4,5,6,7] 2606; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] 2607; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 2608; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4 2609; AVX-NEXT: vorps %ymm7, %ymm4, %ymm4 2610; AVX-NEXT: vextractf128 $1, %ymm4, %xmm6 2611; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] 2612; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4,5,6],xmm7[7] 2613; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2614; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2,3,4],xmm7[5],xmm4[6,7] 2615; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2616; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,2,2] 2617; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] 2618; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 2619; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] 2620; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] 2621; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2622; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 2623; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] 2624; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 2625; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5,6,7] 2626; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] 2627; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] 2628; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] 2629; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 2630; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 2631; AVX-NEXT: vandnps %ymm6, %ymm8, %ymm6 2632; AVX-NEXT: vandps %ymm7, %ymm8, %ymm7 2633; AVX-NEXT: vorps %ymm6, %ymm7, %ymm7 2634; AVX-NEXT: vpsrlq $48, %xmm11, %xmm6 2635; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm15[1],xmm6[1] 2636; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] 2637; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] 2638; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 2639; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 2640; AVX-NEXT: vextractf128 $1, %ymm5, %xmm1 2641; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] 2642; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 2643; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 2644; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2645; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 2646; AVX-NEXT: vmovdqa 32(%r8), %xmm0 2647; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] 2648; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 2649; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] 2650; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] 2651; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2652; AVX-NEXT: vandnps %ymm6, %ymm14, %ymm2 2653; AVX-NEXT: vandps %ymm3, %ymm14, %ymm3 2654; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 2655; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 2656; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] 2657; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2658; AVX-NEXT: vextractf128 $1, %ymm7, %xmm1 2659; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 2660; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7] 2661; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2662; AVX-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] 2663; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2664; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] 2665; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] 2666; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] 2667; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 2668; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] 2669; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 2670; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2671; AVX-NEXT: vmovdqa 16(%rdx), %xmm9 2672; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] 2673; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,3,2,4,5,6,7] 2674; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] 2675; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] 2676; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2677; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 2678; AVX-NEXT: vmovdqa 16(%rcx), %xmm6 2679; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] 2680; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2681; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 2682; AVX-NEXT: vandps %ymm2, %ymm10, %ymm2 2683; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 2684; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 2685; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2686; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] 2687; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2688; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] 2689; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2690; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] 2691; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2692; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] 2693; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 2694; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 2695; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 2696; AVX-NEXT: vmovdqa 16(%rsi), %xmm0 2697; AVX-NEXT: vpsrlq $48, %xmm0, %xmm3 2698; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm1[1],xmm3[1] 2699; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2700; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] 2701; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] 2702; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 2703; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2 2704; AVX-NEXT: vandps %ymm3, %ymm14, %ymm3 2705; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 2706; AVX-NEXT: vmovdqa 16(%r8), %xmm8 2707; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 2708; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] 2709; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4,5,6],xmm7[7] 2710; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2711; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] 2712; AVX-NEXT: vmovdqa (%rdi), %xmm15 2713; AVX-NEXT: vmovdqa (%rsi), %xmm7 2714; AVX-NEXT: vpsrlq $48, %xmm7, %xmm2 2715; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm15[1],xmm2[1] 2716; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2717; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] 2718; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] 2719; 
AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 2720; AVX-NEXT: vmovdqa (%rdx), %xmm5 2721; AVX-NEXT: vmovdqa (%rcx), %xmm4 2722; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 2723; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] 2724; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] 2725; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 2726; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] 2727; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] 2728; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 2729; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2 2730; AVX-NEXT: vandps %ymm14, %ymm12, %ymm12 2731; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2 2732; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 2733; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 2734; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 2735; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] 2736; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2737; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2738; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] 2739; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 2740; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] 2741; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6],xmm1[7] 2742; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] 2743; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] 2744; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] 2745; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 2746; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 2747; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 2748; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 2749; AVX-NEXT: vorps %ymm0, %ymm1, %ymm3 2750; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0 2751; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] 2752; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,1,0,1] 2753; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 2754; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6,7] 2755; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] 2756; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 2757; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] 2758; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 2759; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 2760; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] 2761; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] 2762; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 2763; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] 2764; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] 2765; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] 2766; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] 2767; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 2768; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 2769; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm6 2770; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8 2771; AVX-NEXT: vorps %ymm6, %ymm8, %ymm6 2772; AVX-NEXT: vmovdqa (%r8), %xmm8 2773; AVX-NEXT: vextractf128 $1, %ymm6, %xmm9 2774; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,0,1] 2775; AVX-NEXT: vpblendw {{.*#+}} xmm9 = 
xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] 2776; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6,7] 2777; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] 2778; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] 2779; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2780; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] 2781; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 2782; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 2783; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] 2784; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 2785; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7] 2786; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] 2787; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] 2788; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] 2789; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 2790; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 2791; AVX-NEXT: vandnps %ymm4, %ymm7, %ymm4 2792; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5 2793; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 2794; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 2795; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4,5,6],xmm5[7] 2796; AVX-NEXT: vextractf128 $1, %ymm4, %xmm7 2797; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2,3,4],xmm5[5],xmm7[6,7] 2798; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] 2799; AVX-NEXT: vmovdqa %xmm4, 32(%r9) 2800; AVX-NEXT: vmovdqa %xmm5, 48(%r9) 2801; AVX-NEXT: vmovdqa %xmm6, (%r9) 2802; AVX-NEXT: vmovdqa %xmm9, 16(%r9) 2803; AVX-NEXT: vmovdqa %xmm3, 96(%r9) 2804; AVX-NEXT: vmovdqa %xmm0, 112(%r9) 2805; AVX-NEXT: vmovdqa %xmm2, 64(%r9) 2806; AVX-NEXT: vmovdqa %xmm1, 80(%r9) 2807; AVX-NEXT: vmovdqa %xmm11, 128(%r9) 2808; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2809; AVX-NEXT: vmovaps %xmm0, 144(%r9) 2810; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2811; AVX-NEXT: vmovaps %xmm0, 160(%r9) 2812; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2813; AVX-NEXT: vmovaps %xmm0, 176(%r9) 2814; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2815; AVX-NEXT: vmovaps %xmm0, 224(%r9) 2816; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2817; AVX-NEXT: vmovaps %xmm0, 240(%r9) 2818; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2819; AVX-NEXT: vmovaps %xmm0, 192(%r9) 2820; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2821; AVX-NEXT: vmovaps %xmm0, 208(%r9) 2822; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2823; AVX-NEXT: vmovaps %xmm0, 288(%r9) 2824; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2825; AVX-NEXT: vmovaps %xmm0, 304(%r9) 2826; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2827; AVX-NEXT: vmovaps %xmm0, 256(%r9) 2828; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2829; AVX-NEXT: vmovaps %xmm0, 272(%r9) 2830; AVX-NEXT: addq $56, %rsp 2831; AVX-NEXT: vzeroupper 2832; AVX-NEXT: retq 2833; 2834; AVX2-LABEL: store_i16_stride5_vf32: 2835; AVX2: # %bb.0: 2836; AVX2-NEXT: subq $72, %rsp 2837; AVX2-NEXT: vmovdqa (%rdi), %ymm2 2838; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 2839; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 2840; AVX2-NEXT: vmovdqa (%r8), %ymm1 2841; AVX2-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2842; AVX2-NEXT: vmovdqa (%rdx), %xmm6 2843; AVX2-NEXT: vmovdqa 32(%rdx), %xmm8 2844; AVX2-NEXT: vmovdqa (%rcx), %xmm7 2845; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 2846; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 2847; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 2848; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 2849; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2850; AVX2-NEXT: vmovdqa (%rsi), %xmm12 2851; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 2852; AVX2-NEXT: vmovdqa (%rdi), %xmm11 2853; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 2854; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] 2855; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] 2856; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] 2857; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] 2858; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 2859; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 2860; AVX2-NEXT: vpbroadcastq (%r8), %ymm11 2861; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 2862; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 2863; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2864; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 2865; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 2866; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2867; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] 2868; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] 2869; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] 2870; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 2871; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 2872; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 2873; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 2874; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 2875; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 2876; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm0 2877; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm12 2878; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] 2879; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 2880; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 2881; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm7 2882; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] 2883; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] 2884; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] 2885; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 2886; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 2887; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] 2888; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 2889; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 2890; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2891; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] 2892; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 2893; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] 2894; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] 2895; AVX2-NEXT: vmovdqa (%rsi), %ymm5 2896; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10 2897; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm11 2898; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] 2899; AVX2-NEXT: vmovdqa 32(%rdx), %ymm10 2900; AVX2-NEXT: vpshufb %xmm14, %xmm9, %xmm9 2901; AVX2-NEXT: vmovdqa 32(%rcx), %ymm11 2902; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] 2903; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] 2904; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 2905; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2906; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 2907; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] 2908; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] 2909; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 2910; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] 2911; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 2912; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] 2913; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] 2914; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] 2915; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 2916; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 2917; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] 2918; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 2919; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2920; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] 2921; AVX2-NEXT: vmovdqa %ymm5, %ymm9 2922; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 2923; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] 2924; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] 2925; AVX2-NEXT: vmovdqa (%rcx), %ymm13 2926; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 2927; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,3,2,3,6,7,6,7] 2928; AVX2-NEXT: vmovdqa (%rdx), %ymm15 2929; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,2,3,3,7,6,7,7] 2930; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] 2931; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,2,2] 2932; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] 2933; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 2934; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm5 2935; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 2936; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 2937; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm1 2938; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 2939; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 2940; AVX2-NEXT: # ymm0 = mem[0,1,0,1] 2941; AVX2-NEXT: vpshufb %ymm0, %ymm11, %ymm1 2942; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[1,1,1,2,5,5,5,6] 2943; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] 2944; AVX2-NEXT: vpshuflw 
{{.*#+}} ymm5 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 2945; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 2946; AVX2-NEXT: vmovdqa %ymm4, %ymm7 2947; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] 2948; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] 2949; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 2950; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 2951; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 2952; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 2953; AVX2-NEXT: vpshufb %ymm0, %ymm13, %ymm0 2954; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] 2955; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] 2956; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 2957; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 2958; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] 2959; AVX2-NEXT: vmovdqa %ymm2, %ymm8 2960; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] 2961; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2962; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 2963; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 2964; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm0 2965; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 2966; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 2967; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm1 2968; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 2969; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 2970; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm5 2971; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] 2972; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] 2973; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 2974; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 2975; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] 2976; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 2977; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 2978; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 2979; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm4 2980; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] 2981; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 2982; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm5 2983; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] 2984; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] 2985; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 2986; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 2987; AVX2-NEXT: # ymm4 = mem[1,1,2,2] 2988; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 2989; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 2990; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 2991; AVX2-NEXT: # ymm4 = mem[1,1,2,2] 2992; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 2993; AVX2-NEXT: vmovdqa %ymm3, 64(%r9) 2994; AVX2-NEXT: vmovdqa %ymm2, 224(%r9) 2995; AVX2-NEXT: vmovdqa %ymm1, 96(%r9) 2996; AVX2-NEXT: vmovdqa %ymm14, 128(%r9) 2997; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2998; AVX2-NEXT: vmovaps %ymm1, 192(%r9) 2999; AVX2-NEXT: vmovdqa %ymm12, 288(%r9) 3000; AVX2-NEXT: vmovdqa %ymm0, 256(%r9) 3001; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3002; AVX2-NEXT: vmovaps %ymm0, 32(%r9) 3003; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3004; AVX2-NEXT: vmovaps %ymm0, 160(%r9) 3005; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3006; AVX2-NEXT: vmovaps %ymm0, (%r9) 3007; AVX2-NEXT: addq $72, %rsp 3008; AVX2-NEXT: vzeroupper 3009; AVX2-NEXT: retq 3010; 3011; AVX2-FP-LABEL: store_i16_stride5_vf32: 3012; AVX2-FP: # %bb.0: 3013; AVX2-FP-NEXT: subq $40, %rsp 3014; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 3015; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 3016; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm4 3017; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 3018; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3019; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm2 3020; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3021; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0 3022; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 3023; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 3024; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm6 3025; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm9 3026; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] 3027; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3028; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm13 3029; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm11 3030; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 3031; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm9 3032; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm14 3033; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] 3034; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] 3035; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] 3036; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 3037; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 3038; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] 3039; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 3040; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 3041; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 3042; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm12 3043; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] 3044; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm12 3045; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] 3046; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 3047; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm14 3048; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] 3049; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 3050; AVX2-FP-NEXT: vpshufd 
{{.*#+}} xmm14 = xmm12[1,2,2,2] 3051; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] 3052; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 3053; AVX2-FP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 3054; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3055; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 3056; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] 3057; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 3058; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 3059; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm9 3060; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 3061; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 3062; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 3063; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] 3064; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 3065; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3066; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm7 3067; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 3068; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 3069; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3070; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0 3071; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 3072; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm9 3073; AVX2-FP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 3074; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm10 3075; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3076; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] 3077; AVX2-FP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 3078; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 3079; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 3080; AVX2-FP-NEXT: vpbroadcastq 32(%r8), %ymm8 3081; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 3082; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 3083; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] 3084; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm1 3085; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] 3086; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] 3087; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 3088; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 3089; AVX2-FP-NEXT: vpshufb %ymm12, %ymm10, %ymm11 3090; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] 3091; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] 3092; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] 3093; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 3094; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 3095; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 3096; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 3097; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] 3098; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] 3099; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm14 3100; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 3101; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm15 3102; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] 3103; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] 3104; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 3105; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] 3106; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 3107; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm6 3108; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 3109; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 3110; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm1 3111; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 3112; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 3113; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] 3114; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm1 3115; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] 3116; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] 3117; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] 3118; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,1] 3119; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm0 3120; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] 3121; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] 3122; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3123; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 3124; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 3125; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 3126; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm1 3127; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] 3128; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 3129; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 3130; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] 3131; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] 3132; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3133; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 3134; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 3135; AVX2-FP-NEXT: vpbroadcastq 48(%r8), %ymm2 3136; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 3137; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 3138; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm2 3139; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 3140; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 3141; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm6 3142; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] 3143; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] 3144; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 3145; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 3146; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] 3147; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] 3148; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 3149; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 3150; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 3151; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] 3152; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] 3153; AVX2-FP-NEXT: vpshufb %ymm7, %ymm11, %ymm6 3154; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] 3155; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] 3156; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 3157; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 3158; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] 3159; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 3160; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 3161; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 3162; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] 3163; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 3164; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%r9) 3165; AVX2-FP-NEXT: vmovdqa %ymm3, 224(%r9) 3166; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%r9) 3167; AVX2-FP-NEXT: vmovdqa %ymm13, 128(%r9) 3168; AVX2-FP-NEXT: vmovdqa %ymm12, 288(%r9) 3169; AVX2-FP-NEXT: vmovdqa %ymm0, 256(%r9) 3170; AVX2-FP-NEXT: vmovdqa %ymm8, 160(%r9) 3171; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3172; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r9) 3173; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3174; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) 3175; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3176; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) 3177; AVX2-FP-NEXT: addq $40, %rsp 3178; AVX2-FP-NEXT: vzeroupper 3179; AVX2-FP-NEXT: retq 3180; 3181; AVX2-FCP-LABEL: store_i16_stride5_vf32: 3182; AVX2-FCP: # %bb.0: 3183; AVX2-FCP-NEXT: subq $40, %rsp 3184; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 3185; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 3186; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm4 3187; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 3188; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3189; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm2 3190; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3191; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm0 3192; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 3193; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 3194; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm6 3195; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm9 3196; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] 3197; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3198; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13 3199; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm11 3200; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 3201; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm9 3202; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm14 3203; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] 3204; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] 3205; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] 3206; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 3207; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 3208; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] 3209; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 3210; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 3211; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 3212; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm12 3213; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] 3214; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm12 3215; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] 3216; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 3217; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm14 3218; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] 3219; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 3220; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] 3221; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] 3222; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 3223; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 3224; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3225; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 3226; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] 3227; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 3228; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 3229; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm9 3230; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 3231; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 3232; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 3233; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] 3234; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 3235; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3236; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm7 3237; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 3238; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 3239; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3240; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 3241; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 3242; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 3243; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 3244; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm10 3245; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3246; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] 3247; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 3248; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 3249; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 3250; AVX2-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 3251; 
AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 3252; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 3253; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] 3254; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm1 3255; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] 3256; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] 3257; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 3258; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 3259; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm11 3260; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] 3261; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] 3262; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] 3263; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 3264; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 3265; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 3266; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 3267; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] 3268; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] 3269; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm14 3270; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 3271; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 3272; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] 3273; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] 3274; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 3275; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] 3276; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 3277; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm6 3278; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 3279; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 3280; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 3281; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 3282; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 3283; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] 3284; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm1 3285; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] 3286; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] 3287; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] 3288; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] 3289; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm0 3290; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] 3291; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] 3292; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3293; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 3294; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = 
[65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 3295; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 3296; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm1 3297; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] 3298; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 3299; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 3300; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] 3301; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] 3302; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3303; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 3304; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 3305; AVX2-FCP-NEXT: vpbroadcastq 48(%r8), %ymm2 3306; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 3307; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 3308; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm2 3309; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 3310; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 3311; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm6 3312; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] 3313; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] 3314; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 3315; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 3316; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] 3317; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] 3318; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 3319; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 3320; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 3321; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] 3322; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] 3323; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm6 3324; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] 3325; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] 3326; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 3327; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 3328; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] 3329; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 3330; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 3331; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 3332; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] 3333; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 3334; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%r9) 3335; AVX2-FCP-NEXT: vmovdqa %ymm3, 224(%r9) 3336; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%r9) 3337; AVX2-FCP-NEXT: vmovdqa %ymm13, 128(%r9) 3338; AVX2-FCP-NEXT: vmovdqa %ymm12, 288(%r9) 3339; AVX2-FCP-NEXT: vmovdqa %ymm0, 256(%r9) 3340; 
AVX2-FCP-NEXT: vmovdqa %ymm8, 160(%r9) 3341; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3342; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) 3343; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3344; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) 3345; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3346; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) 3347; AVX2-FCP-NEXT: addq $40, %rsp 3348; AVX2-FCP-NEXT: vzeroupper 3349; AVX2-FCP-NEXT: retq 3350; 3351; AVX512-LABEL: store_i16_stride5_vf32: 3352; AVX512: # %bb.0: 3353; AVX512-NEXT: vmovdqa 32(%rdx), %xmm9 3354; AVX512-NEXT: vmovdqa (%rcx), %xmm12 3355; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 3356; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 3357; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 3358; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0 3359; AVX512-NEXT: vmovdqa64 (%rdx), %ymm17 3360; AVX512-NEXT: vmovdqa 32(%rdx), %ymm6 3361; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,2,3,3,7,6,7,7] 3362; AVX512-NEXT: vmovdqa (%rcx), %ymm1 3363; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 3364; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 3365; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 3366; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 3367; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] 3368; AVX512-NEXT: vmovdqa (%rsi), %xmm14 3369; AVX512-NEXT: vmovdqa 32(%rsi), %xmm8 3370; AVX512-NEXT: vmovdqa (%rdi), %xmm15 3371; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 3372; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] 3373; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 3374; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 3375; AVX512-NEXT: vmovdqa64 (%rdi), %ymm20 3376; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,3,2,3,6,7,6,7] 3377; AVX512-NEXT: vmovdqa (%rsi), %ymm5 3378; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 3379; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] 3380; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] 3381; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] 3382; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] 3383; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 3384; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0)) 3385; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm0 3386; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm2 3387; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 3388; AVX512-NEXT: vpternlogd {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm3)) 3389; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 3390; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 3391; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm0 3392; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 3393; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm18 3394; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm18[1,1,2,2] 3395; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] 3396; 
AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 3397; AVX512-NEXT: vpshufb %xmm7, %xmm8, %xmm8 3398; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm11 3399; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] 3400; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 3401; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm11 3402; AVX512-NEXT: vmovdqa 32(%rcx), %ymm0 3403; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 3404; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm2 3405; AVX512-NEXT: vmovdqa64 %ymm4, %ymm23 3406; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[3,0,3,0,7,4,7,4] 3407; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] 3408; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 3409; AVX512-NEXT: vpshufb %xmm8, %xmm10, %xmm10 3410; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] 3411; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] 3412; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] 3413; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 3414; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm11)) 3415; AVX512-NEXT: vmovdqa (%r8), %ymm9 3416; AVX512-NEXT: vmovdqa 32(%r8), %ymm10 3417; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 3418; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm4 3419; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 3420; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] 3421; AVX512-NEXT: vpandnq %ymm10, %ymm21, %ymm10 3422; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 3423; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & zmm21) 3424; AVX512-NEXT: vmovdqa (%rdx), %xmm2 3425; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] 3426; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4 3427; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 3428; AVX512-NEXT: vpshufb %xmm8, %xmm12, %xmm8 3429; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] 3430; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] 3431; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 3432; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 3433; AVX512-NEXT: vpshufb %xmm7, %xmm14, %xmm4 3434; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm7 3435; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7] 3436; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 3437; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 3438; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] 3439; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 3440; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] 3441; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 3442; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm7 & (zmm4 ^ zmm2)) 3443; AVX512-NEXT: vpbroadcastq (%r8), %ymm2 3444; 
AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] 3445; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 3446; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4)) 3447; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] 3448; AVX512-NEXT: vprolq $16, %ymm3, %ymm8 3449; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] 3450; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 3451; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[2,3,2,3,6,7,6,7] 3452; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 3453; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] 3454; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] 3455; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] 3456; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 3457; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 3458; AVX512-NEXT: # ymm3 = mem[0,1,0,1] 3459; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm8 3460; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[1,1,1,2,5,5,5,6] 3461; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] 3462; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 3463; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,2,3,3,7,6,7,7] 3464; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 3465; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 3466; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] 3467; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] 3468; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 3469; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm7 & (zmm0 ^ zmm4)) 3470; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm4 3471; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm6 3472; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 3473; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 3474; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm6 & (zmm4 ^ zmm0)) 3475; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0 3476; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 3477; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm20[1,1,2,2] 3478; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] 3479; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,1,4,5,6,5] 3480; AVX512-NEXT: vprolq $16, %ymm5, %ymm5 3481; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] 3482; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 3483; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 3484; AVX512-NEXT: vmovdqa64 %ymm23, %ymm5 3485; AVX512-NEXT: vpshufb %ymm5, %ymm1, %ymm5 3486; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[3,0,3,0,7,4,7,4] 3487; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] 3488; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 3489; AVX512-NEXT: 
vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] 3490; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] 3491; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3492; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 3493; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm16 & (zmm1 ^ zmm0)) 3494; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm0 3495; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm3 3496; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 3497; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) 3498; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) 3499; AVX512-NEXT: vmovdqa64 %zmm4, 256(%r9) 3500; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) 3501; AVX512-NEXT: vmovdqa64 %zmm10, 192(%r9) 3502; AVX512-NEXT: vmovdqa64 %zmm19, 128(%r9) 3503; AVX512-NEXT: vzeroupper 3504; AVX512-NEXT: retq 3505; 3506; AVX512-FCP-LABEL: store_i16_stride5_vf32: 3507; AVX512-FCP: # %bb.0: 3508; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 3509; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 3510; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0 3511; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 3512; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 3513; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,1,2,2] 3514; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3515; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10 3516; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 3517; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 3518; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm1 3519; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 3520; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm2 3521; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 3522; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3523; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3524; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm4 3525; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 3526; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 3527; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm1 3528; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 3529; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 3530; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 3531; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,0,3,0,7,4,7,4] 3532; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 3533; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm11 3534; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 3535; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm2 3536; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 3537; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,2,2,2] 3538; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2],xmm13[3],xmm2[4,5],xmm13[6],xmm2[7] 3539; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 3540; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 3541; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) 3542; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 3543; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 3544; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 3545; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm3 3546; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm25 3547; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 3548; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] 3549; AVX512-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm2 3550; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 3551; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm1 & zmm16) 3552; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 3553; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm3 3554; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 3555; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,2,2,2] 3556; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2],xmm14[3],xmm3[4,5],xmm14[6],xmm3[7] 3557; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] 3558; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 3559; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3560; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 3561; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] 3562; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 3563; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm1 3564; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 3565; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] 3566; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 3567; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 3568; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] 3569; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 3570; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 3571; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 3572; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 3573; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 3574; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 3575; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm14 ^ (zmm16 & (zmm1 ^ zmm14)) 3576; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm3 3577; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] 3578; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 3579; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm1)) 3580; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm15 3581; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 3582; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 3583; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm14 3584; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] 3585; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] 3586; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,1,4,5,6,5] 3587; AVX512-FCP-NEXT: vprolq $16, %ymm15, %ymm13 3588; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm3[2],ymm13[3],ymm3[4],ymm13[5,6],ymm3[7],ymm13[8,9],ymm3[10],ymm13[11],ymm3[12],ymm13[13,14],ymm3[15] 3589; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 3590; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 3591; AVX512-FCP-NEXT: vmovdqa64 %ymm24, 
%ymm1 3592; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 3593; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[3,0,3,0,7,4,7,4] 3594; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] 3595; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 3596; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] 3597; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 3598; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm17[1,1,1,2,5,5,5,6] 3599; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10,11],ymm12[12],ymm2[13],ymm12[14],ymm2[15] 3600; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 3601; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 3602; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 3603; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm19 & (zmm2 ^ zmm3)) 3604; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 3605; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm3, %ymm13 3606; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 3607; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 3608; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 3609; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) 3610; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 3611; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] 3612; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm2 3613; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[2,3,2,3,6,7,6,7] 3614; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10],ymm12[11],ymm2[12,13],ymm12[14],ymm2[15] 3615; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,2,1,4,5,6,5] 3616; AVX512-FCP-NEXT: vprolq $16, %ymm5, %ymm5 3617; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] 3618; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] 3619; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 3620; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 3621; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 3622; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] 3623; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] 3624; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 3625; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] 3626; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] 3627; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] 3628; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 3629; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm16 & (zmm8 ^ zmm7)) 3630; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 3631; AVX512-FCP-NEXT: 
vpbroadcastq 56(%r8), %ymm5 3632; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 3633; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm8)) 3634; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] 3635; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 3636; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 3637; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 3638; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,2,3,3,7,6,7,7] 3639; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] 3640; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] 3641; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 3642; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 3643; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 3644; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3645; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 3646; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3647; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm3 3648; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] 3649; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] 3650; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] 3651; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 3652; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm19 & (zmm5 ^ zmm4)) 3653; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 3654; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm3 3655; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 3656; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm5)) 3657; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9) 3658; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) 3659; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) 3660; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r9) 3661; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) 3662; AVX512-FCP-NEXT: vzeroupper 3663; AVX512-FCP-NEXT: retq 3664; 3665; AVX512DQ-LABEL: store_i16_stride5_vf32: 3666; AVX512DQ: # %bb.0: 3667; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm9 3668; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm12 3669; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10 3670; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 3671; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 3672; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm0 3673; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm17 3674; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm6 3675; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,2,3,3,7,6,7,7] 3676; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 3677; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 3678; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 3679; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 3680; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 3681; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] 3682; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm14 3683; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm8 3684; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm15 3685; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 3686; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] 3687; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 3688; 
AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 3689; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm20 3690; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,3,2,3,6,7,6,7] 3691; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm5 3692; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 3693; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] 3694; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] 3695; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] 3696; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] 3697; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 3698; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0)) 3699; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm0 3700; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm2 3701; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 3702; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm3)) 3703; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 3704; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 3705; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm0 3706; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 3707; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm18 3708; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm18[1,1,2,2] 3709; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] 3710; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 3711; AVX512DQ-NEXT: vpshufb %xmm7, %xmm8, %xmm8 3712; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm11 3713; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] 3714; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 3715; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm11 3716; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm0 3717; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 3718; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm2 3719; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm23 3720; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[3,0,3,0,7,4,7,4] 3721; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] 3722; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 3723; AVX512DQ-NEXT: vpshufb %xmm8, %xmm10, %xmm10 3724; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] 3725; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] 3726; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] 3727; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 3728; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm11)) 3729; AVX512DQ-NEXT: vmovdqa (%r8), %ymm9 3730; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm10 3731; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 3732; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm4 3733; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 3734; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] 3735; AVX512DQ-NEXT: vpandnq %ymm10, %ymm21, %ymm10 3736; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 3737; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & zmm21) 3738; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 3739; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] 3740; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4 3741; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 3742; AVX512DQ-NEXT: vpshufb %xmm8, %xmm12, %xmm8 3743; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] 3744; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] 3745; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 3746; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 3747; AVX512DQ-NEXT: vpshufb %xmm7, %xmm14, %xmm4 3748; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm7 3749; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7] 3750; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 3751; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 3752; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] 3753; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 3754; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] 3755; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 3756; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm7 & (zmm4 ^ zmm2)) 3757; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm2 3758; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] 3759; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 3760; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4)) 3761; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] 3762; AVX512DQ-NEXT: vprolq $16, %ymm3, %ymm8 3763; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] 3764; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 3765; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[2,3,2,3,6,7,6,7] 3766; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 3767; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] 3768; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] 3769; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] 3770; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 3771; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 3772; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] 3773; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm8 3774; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[1,1,1,2,5,5,5,6] 3775; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] 3776; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 3777; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,2,3,3,7,6,7,7] 3778; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 3779; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 3780; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] 3781; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] 3782; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 3783; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm7 & (zmm0 ^ zmm4)) 3784; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm4 3785; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm6 3786; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 3787; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 3788; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm6 & (zmm4 ^ zmm0)) 3789; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0 3790; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 3791; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm20[1,1,2,2] 3792; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] 3793; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,1,4,5,6,5] 3794; AVX512DQ-NEXT: vprolq $16, %ymm5, %ymm5 3795; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] 3796; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 3797; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 3798; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm5 3799; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm5 3800; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[3,0,3,0,7,4,7,4] 3801; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] 3802; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 3803; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] 3804; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] 3805; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3806; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 3807; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm16 & (zmm1 ^ zmm0)) 3808; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm0 3809; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm3 3810; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 3811; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) 3812; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) 3813; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%r9) 3814; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) 3815; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%r9) 3816; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%r9) 3817; AVX512DQ-NEXT: vzeroupper 3818; AVX512DQ-NEXT: retq 3819; 3820; AVX512DQ-FCP-LABEL: store_i16_stride5_vf32: 3821; AVX512DQ-FCP: # %bb.0: 3822; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 3823; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 3824; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0 3825; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 3826; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 3827; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,1,2,2] 3828; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 3829; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 3830; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 3831; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 3832; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm1 3833; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 3834; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm2 3835; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 3836; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3837; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3838; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm4 3839; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 3840; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 3841; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm1 3842; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 3843; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 3844; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 3845; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,0,3,0,7,4,7,4] 3846; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 3847; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm11 3848; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 3849; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm2 3850; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 3851; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,2,2,2] 3852; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2],xmm13[3],xmm2[4,5],xmm13[6],xmm2[7] 3853; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 3854; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 3855; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) 3856; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 3857; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 3858; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 3859; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm3 3860; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm25 3861; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 3862; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] 3863; AVX512DQ-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm2 3864; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 3865; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm1 & zmm16) 3866; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 3867; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm3 3868; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 3869; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,2,2,2] 3870; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2],xmm14[3],xmm3[4,5],xmm14[6],xmm3[7] 3871; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] 3872; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 3873; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3874; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 3875; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] 3876; AVX512DQ-FCP-NEXT: vpermi2q 
%zmm3, %zmm1, %zmm14 3877; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm1 3878; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 3879; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] 3880; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 3881; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 3882; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] 3883; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 3884; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 3885; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 3886; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 3887; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 3888; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 3889; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm14 ^ (zmm16 & (zmm1 ^ zmm14)) 3890; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm3 3891; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] 3892; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 3893; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm1)) 3894; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm15 3895; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 3896; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 3897; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm14 3898; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] 3899; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] 3900; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,1,4,5,6,5] 3901; AVX512DQ-FCP-NEXT: vprolq $16, %ymm15, %ymm13 3902; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm3[2],ymm13[3],ymm3[4],ymm13[5,6],ymm3[7],ymm13[8,9],ymm3[10],ymm13[11],ymm3[12],ymm13[13,14],ymm3[15] 3903; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 3904; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 3905; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 3906; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 3907; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[3,0,3,0,7,4,7,4] 3908; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] 3909; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 3910; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] 3911; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 3912; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm17[1,1,1,2,5,5,5,6] 3913; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10,11],ymm12[12],ymm2[13],ymm12[14],ymm2[15] 3914; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 3915; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 3916; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 3917; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm19 & (zmm2 ^ zmm3)) 3918; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 3919; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm3, %ymm13 3920; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 3921; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 3922; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 3923; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) 3924; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 3925; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] 3926; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm2 3927; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[2,3,2,3,6,7,6,7] 3928; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10],ymm12[11],ymm2[12,13],ymm12[14],ymm2[15] 3929; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,2,1,4,5,6,5] 3930; AVX512DQ-FCP-NEXT: vprolq $16, %ymm5, %ymm5 3931; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] 3932; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] 3933; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 3934; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 3935; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 3936; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] 3937; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] 3938; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 3939; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] 3940; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] 3941; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] 3942; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 3943; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm16 & (zmm8 ^ zmm7)) 3944; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 3945; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 3946; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 3947; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm8)) 3948; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] 3949; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 3950; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 3951; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 3952; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,2,3,3,7,6,7,7] 3953; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] 3954; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] 3955; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 3956; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 3957; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 3958; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 3959; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 3960; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3961; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm3 3962; 
AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] 3963; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] 3964; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] 3965; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 3966; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm19 & (zmm5 ^ zmm4)) 3967; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 3968; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm3 3969; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 3970; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm5)) 3971; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9) 3972; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) 3973; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) 3974; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9) 3975; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) 3976; AVX512DQ-FCP-NEXT: vzeroupper 3977; AVX512DQ-FCP-NEXT: retq 3978; 3979; AVX512BW-LABEL: store_i16_stride5_vf32: 3980; AVX512BW: # %bb.0: 3981; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 3982; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 3983; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 3984; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 3985; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 3986; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 3987; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 3988; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 3989; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 3990; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C 3991; AVX512BW-NEXT: kmovd %eax, %k1 3992; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 3993; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 3994; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 3995; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 3996; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 3997; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 3998; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 3999; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 4000; AVX512BW-NEXT: kmovd %eax, %k2 4001; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} 4002; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 4003; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 4004; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 4005; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4006; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 4007; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 4008; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} 4009; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 4010; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 4011; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 4012; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 4013; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 
4014; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 4015; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 4016; AVX512BW-NEXT: kmovd %eax, %k2 4017; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 4018; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 4019; AVX512BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 4020; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 4021; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 4022; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 4023; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 4024; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} 4025; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 4026; AVX512BW-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 4027; AVX512BW-NEXT: vmovdqa64 %zmm1, 256(%r9) 4028; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) 4029; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) 4030; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r9) 4031; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) 4032; AVX512BW-NEXT: vzeroupper 4033; AVX512BW-NEXT: retq 4034; 4035; AVX512BW-FCP-LABEL: store_i16_stride5_vf32: 4036; AVX512BW-FCP: # %bb.0: 4037; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 4038; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 4039; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 4040; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 4041; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 4042; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 4043; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 4044; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 4045; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 4046; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C 4047; AVX512BW-FCP-NEXT: kmovd %eax, %k1 4048; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4049; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 4050; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 4051; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 4052; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 4053; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 4054; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4055; AVX512BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 4056; AVX512BW-FCP-NEXT: kmovd %eax, %k2 4057; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} 4058; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 4059; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 4060; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 4061; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4062; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 4063; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 4064; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} 4065; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = 
[44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 4066; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 4067; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 4068; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 4069; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 4070; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 4071; AVX512BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 4072; AVX512BW-FCP-NEXT: kmovd %eax, %k2 4073; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 4074; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 4075; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 4076; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 4077; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 4078; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 4079; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 4080; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} 4081; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 4082; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 4083; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) 4084; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) 4085; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) 4086; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r9) 4087; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) 4088; AVX512BW-FCP-NEXT: vzeroupper 4089; AVX512BW-FCP-NEXT: retq 4090; 4091; AVX512DQ-BW-LABEL: store_i16_stride5_vf32: 4092; AVX512DQ-BW: # %bb.0: 4093; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 4094; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 4095; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 4096; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 4097; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 4098; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 4099; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 4100; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 4101; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 4102; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C 4103; AVX512DQ-BW-NEXT: kmovd %eax, %k1 4104; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4105; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 4106; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 4107; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 4108; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 4109; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 4110; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4111; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 4112; AVX512DQ-BW-NEXT: kmovd %eax, %k2 4113; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} 4114; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 4115; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 4116; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 4117; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4118; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 4119; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 4120; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} 4121; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 4122; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 4123; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 4124; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 4125; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 4126; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 4127; AVX512DQ-BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 4128; AVX512DQ-BW-NEXT: kmovd %eax, %k2 4129; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 4130; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 4131; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 4132; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 4133; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 4134; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 4135; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 4136; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} 4137; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 4138; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 4139; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 256(%r9) 4140; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) 4141; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%r9) 4142; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%r9) 4143; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) 4144; AVX512DQ-BW-NEXT: vzeroupper 4145; AVX512DQ-BW-NEXT: retq 4146; 4147; AVX512DQ-BW-FCP-LABEL: store_i16_stride5_vf32: 4148; AVX512DQ-BW-FCP: # %bb.0: 4149; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 4150; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 4151; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 4152; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 4153; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 4154; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 4155; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 4156; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 4157; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 4158; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C 4159; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 4160; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4161; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 4162; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 4163; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 4164; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 4165; AVX512DQ-BW-FCP-NEXT: vpmovsxbw 
{{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 4166; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4167; AVX512DQ-BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 4168; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 4169; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} 4170; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 4171; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 4172; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 4173; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 4174; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 4175; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 4176; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} 4177; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 4178; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 4179; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 4180; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 4181; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 4182; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 4183; AVX512DQ-BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 4184; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 4185; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 4186; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 4187; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 4188; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 4189; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 4190; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 4191; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 4192; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} 4193; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 4194; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 4195; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) 4196; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) 4197; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) 4198; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r9) 4199; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) 4200; AVX512DQ-BW-FCP-NEXT: vzeroupper 4201; AVX512DQ-BW-FCP-NEXT: retq 4202 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64 4203 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64 4204 %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 64 4205 %in.vec3 = load <32 x i16>, ptr %in.vecptr3, align 64 4206 %in.vec4 = load <32 x i16>, ptr %in.vecptr4, align 64 4207 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, 
i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 4208 %2 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 4209 %3 = shufflevector <64 x i16> %1, <64 x i16> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 4210 %4 = shufflevector <32 x i16> %in.vec4, <32 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 4211 %5 = shufflevector <128 x i16> %3, <128 x i16> %4, <160 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 
27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159> 4212 %interleaved.vec = shufflevector <160 x i16> %5, <160 x i16> poison, <160 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 1, i32 33, i32 65, i32 97, i32 129, i32 2, i32 34, i32 66, i32 98, i32 130, i32 3, i32 35, i32 67, i32 99, i32 131, i32 4, i32 36, i32 68, i32 100, i32 132, i32 5, i32 37, i32 69, i32 101, i32 133, i32 6, i32 38, i32 70, i32 102, i32 134, i32 7, i32 39, i32 71, i32 103, i32 135, i32 8, i32 40, i32 72, i32 104, i32 136, i32 9, i32 41, i32 73, i32 105, i32 137, i32 10, i32 42, i32 74, i32 106, i32 138, i32 11, i32 43, i32 75, i32 107, i32 139, i32 12, i32 44, i32 76, i32 108, i32 140, i32 13, i32 45, i32 77, i32 109, i32 141, i32 14, i32 46, i32 78, i32 110, i32 142, i32 15, i32 47, i32 79, i32 111, i32 143, i32 16, i32 48, i32 80, i32 112, i32 144, i32 17, i32 49, i32 81, i32 113, i32 145, i32 18, i32 50, i32 82, i32 114, i32 146, i32 19, i32 51, i32 83, i32 115, i32 147, i32 20, i32 52, i32 84, i32 116, i32 148, i32 21, i32 53, i32 85, i32 117, i32 149, i32 22, i32 54, i32 86, i32 118, i32 150, i32 23, i32 55, i32 87, i32 119, i32 151, i32 24, i32 56, i32 88, i32 120, i32 152, i32 25, i32 57, i32 89, i32 121, i32 153, i32 26, i32 58, i32 90, i32 122, i32 154, i32 27, i32 59, i32 91, i32 123, i32 155, i32 28, i32 60, i32 92, i32 124, i32 156, i32 29, i32 61, i32 93, i32 125, i32 157, i32 30, i32 62, i32 94, i32 126, i32 158, i32 31, i32 63, i32 95, i32 127, i32 159> 4213 store <160 x i16> %interleaved.vec, ptr %out.vec, align 64 4214 ret void 4215} 4216 4217define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { 4218; SSE-LABEL: store_i16_stride5_vf64: 4219; SSE: # %bb.0: 4220; SSE-NEXT: subq $616, %rsp # imm = 0x268 4221; SSE-NEXT: movdqa (%rdi), %xmm14 4222; SSE-NEXT: movdqa 16(%rdi), %xmm4 4223; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4224; SSE-NEXT: movdqa (%rsi), %xmm12 4225; SSE-NEXT: movdqa 16(%rsi), %xmm11 4226; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4227; SSE-NEXT: movdqa (%rdx), %xmm7 4228; SSE-NEXT: movdqa (%rcx), %xmm0 4229; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4230; SSE-NEXT: movdqa 16(%rcx), %xmm13 4231; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4232; SSE-NEXT: movdqa (%r8), %xmm15 4233; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4234; 
SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] 4235; SSE-NEXT: movdqa %xmm9, %xmm1 4236; SSE-NEXT: pandn %xmm14, %xmm1 4237; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] 4238; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 4239; SSE-NEXT: pand %xmm9, %xmm3 4240; SSE-NEXT: por %xmm1, %xmm3 4241; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] 4242; SSE-NEXT: movdqa %xmm1, %xmm5 4243; SSE-NEXT: pandn %xmm3, %xmm5 4244; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] 4245; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,65535,0] 4246; SSE-NEXT: pand %xmm10, %xmm3 4247; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] 4248; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 4249; SSE-NEXT: movdqa %xmm10, %xmm8 4250; SSE-NEXT: pandn %xmm6, %xmm8 4251; SSE-NEXT: por %xmm3, %xmm8 4252; SSE-NEXT: pand %xmm1, %xmm8 4253; SSE-NEXT: por %xmm5, %xmm8 4254; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] 4255; SSE-NEXT: pand %xmm2, %xmm8 4256; SSE-NEXT: movdqa %xmm2, %xmm0 4257; SSE-NEXT: pandn %xmm15, %xmm0 4258; SSE-NEXT: por %xmm8, %xmm0 4259; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4260; SSE-NEXT: movdqa %xmm9, %xmm3 4261; SSE-NEXT: pandn %xmm4, %xmm3 4262; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,3,3,3,4,5,6,7] 4263; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 4264; SSE-NEXT: pand %xmm9, %xmm5 4265; SSE-NEXT: por %xmm3, %xmm5 4266; SSE-NEXT: movdqa %xmm1, %xmm3 4267; SSE-NEXT: pandn %xmm5, %xmm3 4268; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] 4269; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 4270; SSE-NEXT: movdqa %xmm10, %xmm6 4271; SSE-NEXT: pandn %xmm5, %xmm6 4272; SSE-NEXT: movdqa 16(%rdx), %xmm5 4273; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] 4274; SSE-NEXT: pand %xmm10, %xmm8 4275; SSE-NEXT: por %xmm8, %xmm6 4276; SSE-NEXT: pand %xmm1, %xmm6 4277; SSE-NEXT: por %xmm3, %xmm6 4278; SSE-NEXT: movdqa 16(%r8), %xmm3 4279; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4280; SSE-NEXT: pand %xmm2, %xmm6 4281; SSE-NEXT: movdqa %xmm2, %xmm0 4282; SSE-NEXT: pandn %xmm3, %xmm0 4283; SSE-NEXT: por %xmm6, %xmm0 4284; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4285; SSE-NEXT: movdqa 32(%rdi), %xmm0 4286; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4287; SSE-NEXT: movdqa %xmm9, %xmm3 4288; SSE-NEXT: pandn %xmm0, %xmm3 4289; SSE-NEXT: movdqa 32(%rsi), %xmm0 4290; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4291; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] 4292; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 4293; SSE-NEXT: pand %xmm9, %xmm6 4294; SSE-NEXT: por %xmm3, %xmm6 4295; SSE-NEXT: movdqa %xmm1, %xmm3 4296; SSE-NEXT: pandn %xmm6, %xmm3 4297; SSE-NEXT: movdqa 32(%rcx), %xmm0 4298; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4299; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] 4300; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 4301; SSE-NEXT: movdqa %xmm10, %xmm8 4302; SSE-NEXT: pandn %xmm6, %xmm8 4303; SSE-NEXT: movdqa 32(%rdx), %xmm6 4304; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] 4305; SSE-NEXT: pand %xmm10, %xmm11 4306; SSE-NEXT: por %xmm11, %xmm8 4307; SSE-NEXT: pand %xmm1, %xmm8 4308; SSE-NEXT: por %xmm3, %xmm8 4309; SSE-NEXT: pand %xmm2, %xmm8 4310; SSE-NEXT: movdqa 32(%r8), %xmm3 4311; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill 4312; SSE-NEXT: movdqa %xmm2, %xmm0 4313; SSE-NEXT: pandn %xmm3, %xmm0 4314; SSE-NEXT: por %xmm8, %xmm0 4315; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4316; SSE-NEXT: movdqa 48(%rdi), %xmm0 4317; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4318; SSE-NEXT: movdqa %xmm9, %xmm3 4319; SSE-NEXT: pandn %xmm0, %xmm3 4320; SSE-NEXT: movdqa 48(%rsi), %xmm0 4321; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4322; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] 4323; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 4324; SSE-NEXT: pand %xmm9, %xmm8 4325; SSE-NEXT: por %xmm3, %xmm8 4326; SSE-NEXT: movdqa %xmm1, %xmm3 4327; SSE-NEXT: pandn %xmm8, %xmm3 4328; SSE-NEXT: movdqa 48(%rcx), %xmm0 4329; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4330; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] 4331; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] 4332; SSE-NEXT: movdqa %xmm10, %xmm11 4333; SSE-NEXT: pandn %xmm8, %xmm11 4334; SSE-NEXT: movdqa 48(%rdx), %xmm8 4335; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] 4336; SSE-NEXT: pand %xmm10, %xmm15 4337; SSE-NEXT: por %xmm15, %xmm11 4338; SSE-NEXT: pand %xmm1, %xmm11 4339; SSE-NEXT: por %xmm3, %xmm11 4340; SSE-NEXT: pand %xmm2, %xmm11 4341; SSE-NEXT: movdqa 48(%r8), %xmm3 4342; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4343; SSE-NEXT: movdqa %xmm2, %xmm0 4344; SSE-NEXT: pandn %xmm3, %xmm0 4345; SSE-NEXT: por %xmm11, %xmm0 4346; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4347; SSE-NEXT: movdqa 64(%rdi), %xmm0 4348; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4349; SSE-NEXT: movdqa %xmm9, %xmm3 4350; SSE-NEXT: pandn %xmm0, %xmm3 4351; SSE-NEXT: movdqa 64(%rsi), %xmm0 4352; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4353; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,3,3,3,4,5,6,7] 4354; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 4355; SSE-NEXT: pand %xmm9, %xmm11 4356; SSE-NEXT: por %xmm3, %xmm11 4357; SSE-NEXT: movdqa %xmm1, %xmm15 4358; SSE-NEXT: pandn %xmm11, %xmm15 4359; SSE-NEXT: movdqa 64(%rcx), %xmm0 4360; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4361; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] 4362; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 4363; SSE-NEXT: movdqa %xmm10, %xmm11 4364; SSE-NEXT: pandn %xmm3, %xmm11 4365; SSE-NEXT: movdqa 64(%rdx), %xmm0 4366; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4367; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 4368; SSE-NEXT: pand %xmm10, %xmm0 4369; SSE-NEXT: por %xmm0, %xmm11 4370; SSE-NEXT: pand %xmm1, %xmm11 4371; SSE-NEXT: por %xmm15, %xmm11 4372; SSE-NEXT: pand %xmm2, %xmm11 4373; SSE-NEXT: movdqa 64(%r8), %xmm3 4374; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 4375; SSE-NEXT: movdqa %xmm2, %xmm0 4376; SSE-NEXT: pandn %xmm3, %xmm0 4377; SSE-NEXT: por %xmm11, %xmm0 4378; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4379; SSE-NEXT: movdqa 80(%rdi), %xmm3 4380; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4381; SSE-NEXT: movdqa %xmm9, %xmm0 4382; SSE-NEXT: pandn %xmm3, %xmm0 4383; SSE-NEXT: movdqa 80(%rsi), %xmm3 4384; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4385; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] 4386; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 4387; SSE-NEXT: pand %xmm9, %xmm11 4388; SSE-NEXT: por %xmm0, %xmm11 
4389; SSE-NEXT: movdqa %xmm1, %xmm0 4390; SSE-NEXT: pandn %xmm11, %xmm0 4391; SSE-NEXT: movdqa 80(%rcx), %xmm3 4392; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4393; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] 4394; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 4395; SSE-NEXT: movdqa %xmm10, %xmm15 4396; SSE-NEXT: pandn %xmm11, %xmm15 4397; SSE-NEXT: movdqa 80(%rdx), %xmm3 4398; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4399; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] 4400; SSE-NEXT: pand %xmm10, %xmm11 4401; SSE-NEXT: por %xmm11, %xmm15 4402; SSE-NEXT: pand %xmm1, %xmm15 4403; SSE-NEXT: por %xmm0, %xmm15 4404; SSE-NEXT: pand %xmm2, %xmm15 4405; SSE-NEXT: movdqa 80(%r8), %xmm3 4406; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4407; SSE-NEXT: movdqa %xmm2, %xmm0 4408; SSE-NEXT: pandn %xmm3, %xmm0 4409; SSE-NEXT: por %xmm15, %xmm0 4410; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4411; SSE-NEXT: movdqa 96(%rdi), %xmm4 4412; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4413; SSE-NEXT: movdqa %xmm9, %xmm0 4414; SSE-NEXT: pandn %xmm4, %xmm0 4415; SSE-NEXT: movdqa 96(%rsi), %xmm3 4416; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4417; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] 4418; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 4419; SSE-NEXT: pand %xmm9, %xmm11 4420; SSE-NEXT: por %xmm0, %xmm11 4421; SSE-NEXT: movdqa %xmm1, %xmm0 4422; SSE-NEXT: pandn %xmm11, %xmm0 4423; SSE-NEXT: movdqa 96(%rcx), %xmm3 4424; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4425; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] 4426; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 4427; SSE-NEXT: movdqa %xmm10, %xmm15 4428; SSE-NEXT: pandn %xmm11, %xmm15 4429; SSE-NEXT: movdqa 96(%rdx), %xmm3 4430; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4431; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] 4432; SSE-NEXT: pand %xmm10, %xmm11 4433; SSE-NEXT: por %xmm11, %xmm15 4434; SSE-NEXT: pand %xmm1, %xmm15 4435; SSE-NEXT: por %xmm0, %xmm15 4436; SSE-NEXT: pand %xmm2, %xmm15 4437; SSE-NEXT: movdqa 96(%r8), %xmm3 4438; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4439; SSE-NEXT: movdqa %xmm2, %xmm0 4440; SSE-NEXT: pandn %xmm3, %xmm0 4441; SSE-NEXT: por %xmm15, %xmm0 4442; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4443; SSE-NEXT: movdqa 112(%rdi), %xmm4 4444; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4445; SSE-NEXT: movdqa %xmm9, %xmm0 4446; SSE-NEXT: pandn %xmm4, %xmm0 4447; SSE-NEXT: movdqa 112(%rsi), %xmm4 4448; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4449; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] 4450; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 4451; SSE-NEXT: pand %xmm9, %xmm11 4452; SSE-NEXT: por %xmm0, %xmm11 4453; SSE-NEXT: movdqa 112(%rcx), %xmm0 4454; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4455; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 4456; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 4457; SSE-NEXT: movdqa %xmm10, %xmm15 4458; SSE-NEXT: pandn %xmm0, %xmm15 4459; SSE-NEXT: movdqa 112(%rdx), %xmm0 4460; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4461; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 4462; SSE-NEXT: pand %xmm10, %xmm0 4463; SSE-NEXT: por %xmm0, %xmm15 4464; SSE-NEXT: pand 
%xmm1, %xmm15 4465; SSE-NEXT: pandn %xmm11, %xmm1 4466; SSE-NEXT: por %xmm15, %xmm1 4467; SSE-NEXT: pand %xmm2, %xmm1 4468; SSE-NEXT: movdqa 112(%r8), %xmm0 4469; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4470; SSE-NEXT: pandn %xmm0, %xmm2 4471; SSE-NEXT: por %xmm1, %xmm2 4472; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4473; SSE-NEXT: movdqa %xmm7, %xmm0 4474; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4475; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4476; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 4477; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 4478; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0] 4479; SSE-NEXT: movdqa %xmm1, %xmm11 4480; SSE-NEXT: pandn %xmm0, %xmm11 4481; SSE-NEXT: movdqa %xmm14, %xmm0 4482; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] 4483; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,1,3,2,4,5,6,7] 4484; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,1,1] 4485; SSE-NEXT: pand %xmm1, %xmm15 4486; SSE-NEXT: por %xmm11, %xmm15 4487; SSE-NEXT: movdqa %xmm9, %xmm13 4488; SSE-NEXT: pand %xmm9, %xmm15 4489; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4490; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,1,0,1] 4491; SSE-NEXT: movdqa %xmm9, %xmm1 4492; SSE-NEXT: pandn %xmm11, %xmm1 4493; SSE-NEXT: por %xmm15, %xmm1 4494; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4495; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] 4496; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4497; SSE-NEXT: movdqa %xmm2, %xmm15 4498; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] 4499; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] 4500; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,4,6,7] 4501; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,2,3] 4502; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4503; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,65535,65535,0,65535] 4504; SSE-NEXT: movdqa %xmm15, %xmm1 4505; SSE-NEXT: pandn %xmm11, %xmm1 4506; SSE-NEXT: pand %xmm15, %xmm0 4507; SSE-NEXT: por %xmm0, %xmm1 4508; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4509; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] 4510; SSE-NEXT: movdqa %xmm7, %xmm1 4511; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4512; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,65535,65535,65535] 4513; SSE-NEXT: movdqa %xmm9, %xmm11 4514; SSE-NEXT: pandn %xmm1, %xmm11 4515; SSE-NEXT: movdqa %xmm14, %xmm1 4516; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] 4517; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] 4518; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] 4519; SSE-NEXT: pand %xmm9, %xmm1 4520; SSE-NEXT: por %xmm11, %xmm1 4521; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] 4522; SSE-NEXT: pand %xmm3, %xmm1 4523; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 4524; SSE-NEXT: movdqa %xmm3, %xmm11 4525; SSE-NEXT: pandn %xmm0, %xmm11 4526; SSE-NEXT: por %xmm1, %xmm11 4527; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4528; SSE-NEXT: psrlq $48, %xmm12 4529; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm12[1] 4530; SSE-NEXT: movdqa %xmm9, 
%xmm1 4531; SSE-NEXT: pandn %xmm14, %xmm1 4532; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] 4533; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] 4534; SSE-NEXT: pand %xmm9, %xmm7 4535; SSE-NEXT: por %xmm1, %xmm7 4536; SSE-NEXT: movdqa %xmm10, %xmm1 4537; SSE-NEXT: pandn %xmm0, %xmm1 4538; SSE-NEXT: pand %xmm10, %xmm7 4539; SSE-NEXT: por %xmm7, %xmm1 4540; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4541; SSE-NEXT: movdqa %xmm5, %xmm0 4542; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4543; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4544; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 4545; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 4546; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] 4547; SSE-NEXT: movdqa %xmm4, %xmm1 4548; SSE-NEXT: pandn %xmm0, %xmm1 4549; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4550; SSE-NEXT: movdqa %xmm12, %xmm0 4551; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4552; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] 4553; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7] 4554; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] 4555; SSE-NEXT: pand %xmm4, %xmm7 4556; SSE-NEXT: por %xmm1, %xmm7 4557; SSE-NEXT: pand %xmm13, %xmm7 4558; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4559; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] 4560; SSE-NEXT: movdqa %xmm13, %xmm11 4561; SSE-NEXT: pandn %xmm1, %xmm11 4562; SSE-NEXT: por %xmm7, %xmm11 4563; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4564; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] 4565; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4566; SSE-NEXT: movdqa %xmm2, %xmm7 4567; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] 4568; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] 4569; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] 4570; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 4571; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 4572; SSE-NEXT: movdqa %xmm15, %xmm0 4573; SSE-NEXT: pandn %xmm1, %xmm0 4574; SSE-NEXT: pand %xmm15, %xmm7 4575; SSE-NEXT: por %xmm7, %xmm0 4576; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4577; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 4578; SSE-NEXT: movdqa %xmm5, %xmm0 4579; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4580; SSE-NEXT: movdqa %xmm9, %xmm1 4581; SSE-NEXT: pandn %xmm0, %xmm1 4582; SSE-NEXT: movdqa %xmm12, %xmm11 4583; SSE-NEXT: movdqa %xmm12, %xmm0 4584; SSE-NEXT: movdqa %xmm14, %xmm12 4585; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 4586; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] 4587; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 4588; SSE-NEXT: pand %xmm9, %xmm0 4589; SSE-NEXT: por %xmm1, %xmm0 4590; SSE-NEXT: pand %xmm3, %xmm0 4591; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 4592; SSE-NEXT: movdqa %xmm3, %xmm7 4593; SSE-NEXT: movdqa %xmm3, %xmm14 4594; SSE-NEXT: pandn %xmm1, %xmm7 4595; SSE-NEXT: por %xmm0, %xmm7 4596; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4597; SSE-NEXT: movdqa %xmm12, %xmm0 4598; SSE-NEXT: psrlq $48, %xmm0 4599; SSE-NEXT: movdqa %xmm11, 
%xmm3 4600; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] 4601; SSE-NEXT: movdqa %xmm9, %xmm0 4602; SSE-NEXT: pandn %xmm3, %xmm0 4603; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] 4604; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 4605; SSE-NEXT: pand %xmm9, %xmm5 4606; SSE-NEXT: por %xmm0, %xmm5 4607; SSE-NEXT: movdqa %xmm10, %xmm0 4608; SSE-NEXT: pandn %xmm1, %xmm0 4609; SSE-NEXT: pand %xmm10, %xmm5 4610; SSE-NEXT: por %xmm5, %xmm0 4611; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4612; SSE-NEXT: movdqa %xmm6, %xmm0 4613; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4614; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] 4615; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 4616; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 4617; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] 4618; SSE-NEXT: movdqa %xmm4, %xmm1 4619; SSE-NEXT: pandn %xmm0, %xmm1 4620; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4621; SSE-NEXT: movdqa %xmm7, %xmm0 4622; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4623; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] 4624; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] 4625; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] 4626; SSE-NEXT: pand %xmm4, %xmm5 4627; SSE-NEXT: por %xmm1, %xmm5 4628; SSE-NEXT: movdqa %xmm13, %xmm2 4629; SSE-NEXT: pand %xmm13, %xmm5 4630; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4631; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] 4632; SSE-NEXT: movdqa %xmm2, %xmm3 4633; SSE-NEXT: pandn %xmm1, %xmm3 4634; SSE-NEXT: por %xmm5, %xmm3 4635; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4636; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] 4637; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4638; SSE-NEXT: movdqa %xmm12, %xmm5 4639; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 4640; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 4641; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] 4642; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4643; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 4644; SSE-NEXT: movdqa %xmm15, %xmm0 4645; SSE-NEXT: pandn %xmm1, %xmm0 4646; SSE-NEXT: pand %xmm15, %xmm5 4647; SSE-NEXT: por %xmm5, %xmm0 4648; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4649; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] 4650; SSE-NEXT: movdqa %xmm6, %xmm0 4651; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4652; SSE-NEXT: movdqa %xmm9, %xmm1 4653; SSE-NEXT: pandn %xmm0, %xmm1 4654; SSE-NEXT: movdqa %xmm7, %xmm3 4655; SSE-NEXT: movdqa %xmm7, %xmm0 4656; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 4657; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] 4658; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 4659; SSE-NEXT: pand %xmm9, %xmm0 4660; SSE-NEXT: por %xmm1, %xmm0 4661; SSE-NEXT: pand %xmm14, %xmm0 4662; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] 4663; SSE-NEXT: movdqa %xmm14, %xmm7 4664; SSE-NEXT: pandn %xmm1, %xmm7 4665; SSE-NEXT: por %xmm0, %xmm7 4666; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4667; SSE-NEXT: movdqa %xmm11, %xmm0 4668; 
SSE-NEXT: psrlq $48, %xmm0 4669; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] 4670; SSE-NEXT: movdqa %xmm9, %xmm0 4671; SSE-NEXT: pandn %xmm3, %xmm0 4672; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] 4673; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 4674; SSE-NEXT: pand %xmm9, %xmm5 4675; SSE-NEXT: por %xmm0, %xmm5 4676; SSE-NEXT: movdqa %xmm10, %xmm0 4677; SSE-NEXT: pandn %xmm1, %xmm0 4678; SSE-NEXT: pand %xmm10, %xmm5 4679; SSE-NEXT: por %xmm5, %xmm0 4680; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4681; SSE-NEXT: movdqa %xmm8, %xmm0 4682; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4683; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] 4684; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 4685; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 4686; SSE-NEXT: movdqa %xmm4, %xmm1 4687; SSE-NEXT: pandn %xmm0, %xmm1 4688; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4689; SSE-NEXT: movdqa %xmm3, %xmm0 4690; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4691; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 4692; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] 4693; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] 4694; SSE-NEXT: pand %xmm4, %xmm5 4695; SSE-NEXT: por %xmm1, %xmm5 4696; SSE-NEXT: pand %xmm2, %xmm5 4697; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4698; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] 4699; SSE-NEXT: movdqa %xmm2, %xmm12 4700; SSE-NEXT: pandn %xmm1, %xmm12 4701; SSE-NEXT: por %xmm5, %xmm12 4702; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4703; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] 4704; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4705; SSE-NEXT: movdqa %xmm7, %xmm5 4706; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] 4707; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 4708; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] 4709; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4710; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 4711; SSE-NEXT: movdqa %xmm15, %xmm0 4712; SSE-NEXT: pandn %xmm1, %xmm0 4713; SSE-NEXT: pand %xmm15, %xmm5 4714; SSE-NEXT: por %xmm5, %xmm0 4715; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4716; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 4717; SSE-NEXT: movdqa %xmm8, %xmm0 4718; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4719; SSE-NEXT: movdqa %xmm9, %xmm1 4720; SSE-NEXT: pandn %xmm0, %xmm1 4721; SSE-NEXT: movdqa %xmm3, %xmm0 4722; SSE-NEXT: movdqa %xmm6, %xmm5 4723; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 4724; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] 4725; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 4726; SSE-NEXT: pand %xmm9, %xmm0 4727; SSE-NEXT: por %xmm1, %xmm0 4728; SSE-NEXT: pand %xmm14, %xmm0 4729; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] 4730; SSE-NEXT: movdqa %xmm14, %xmm6 4731; SSE-NEXT: pandn %xmm1, %xmm6 4732; SSE-NEXT: por %xmm0, %xmm6 4733; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4734; SSE-NEXT: movdqa %xmm5, %xmm0 4735; SSE-NEXT: psrlq $48, %xmm0 4736; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] 4737; SSE-NEXT: movdqa 
%xmm9, %xmm0 4738; SSE-NEXT: pandn %xmm3, %xmm0 4739; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,5,7,6] 4740; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] 4741; SSE-NEXT: pand %xmm9, %xmm5 4742; SSE-NEXT: por %xmm0, %xmm5 4743; SSE-NEXT: movdqa %xmm10, %xmm0 4744; SSE-NEXT: pandn %xmm1, %xmm0 4745; SSE-NEXT: pand %xmm10, %xmm5 4746; SSE-NEXT: por %xmm5, %xmm0 4747; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4748; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4749; SSE-NEXT: movdqa %xmm11, %xmm0 4750; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4751; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 4752; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 4753; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 4754; SSE-NEXT: movdqa %xmm4, %xmm1 4755; SSE-NEXT: pandn %xmm0, %xmm1 4756; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4757; SSE-NEXT: movdqa %xmm3, %xmm0 4758; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4759; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] 4760; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] 4761; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] 4762; SSE-NEXT: pand %xmm4, %xmm5 4763; SSE-NEXT: por %xmm1, %xmm5 4764; SSE-NEXT: pand %xmm2, %xmm5 4765; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload 4766; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] 4767; SSE-NEXT: movdqa %xmm2, %xmm12 4768; SSE-NEXT: pandn %xmm1, %xmm12 4769; SSE-NEXT: por %xmm5, %xmm12 4770; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4771; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] 4772; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4773; SSE-NEXT: movdqa %xmm6, %xmm5 4774; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] 4775; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 4776; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] 4777; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4778; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 4779; SSE-NEXT: movdqa %xmm15, %xmm0 4780; SSE-NEXT: pandn %xmm1, %xmm0 4781; SSE-NEXT: pand %xmm15, %xmm5 4782; SSE-NEXT: por %xmm5, %xmm0 4783; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4784; SSE-NEXT: movdqa %xmm11, %xmm1 4785; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 4786; SSE-NEXT: movdqa %xmm1, %xmm0 4787; SSE-NEXT: movdqa %xmm1, %xmm6 4788; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4789; SSE-NEXT: movdqa %xmm9, %xmm1 4790; SSE-NEXT: pandn %xmm0, %xmm1 4791; SSE-NEXT: movdqa %xmm3, %xmm0 4792; SSE-NEXT: movdqa %xmm7, %xmm5 4793; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] 4794; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] 4795; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 4796; SSE-NEXT: pand %xmm9, %xmm0 4797; SSE-NEXT: por %xmm1, %xmm0 4798; SSE-NEXT: pand %xmm14, %xmm0 4799; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] 4800; SSE-NEXT: movdqa %xmm14, %xmm7 4801; SSE-NEXT: pandn %xmm1, %xmm7 4802; SSE-NEXT: por %xmm0, %xmm7 4803; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4804; SSE-NEXT: movdqa %xmm5, %xmm0 4805; SSE-NEXT: psrlq $48, %xmm0 4806; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] 
4807; SSE-NEXT: movdqa %xmm9, %xmm0 4808; SSE-NEXT: pandn %xmm3, %xmm0 4809; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] 4810; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 4811; SSE-NEXT: pand %xmm9, %xmm3 4812; SSE-NEXT: por %xmm0, %xmm3 4813; SSE-NEXT: movdqa %xmm10, %xmm0 4814; SSE-NEXT: pandn %xmm1, %xmm0 4815; SSE-NEXT: pand %xmm10, %xmm3 4816; SSE-NEXT: por %xmm3, %xmm0 4817; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4818; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4819; SSE-NEXT: movdqa %xmm12, %xmm0 4820; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4821; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 4822; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 4823; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 4824; SSE-NEXT: movdqa %xmm4, %xmm1 4825; SSE-NEXT: pandn %xmm0, %xmm1 4826; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4827; SSE-NEXT: movdqa %xmm7, %xmm3 4828; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4829; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] 4830; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,2,4,5,6,7] 4831; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,1] 4832; SSE-NEXT: pand %xmm4, %xmm5 4833; SSE-NEXT: por %xmm1, %xmm5 4834; SSE-NEXT: pand %xmm2, %xmm5 4835; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4836; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] 4837; SSE-NEXT: movdqa %xmm2, %xmm0 4838; SSE-NEXT: pandn %xmm1, %xmm0 4839; SSE-NEXT: por %xmm5, %xmm0 4840; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4841; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] 4842; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 4843; SSE-NEXT: movdqa %xmm6, %xmm5 4844; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] 4845; SSE-NEXT: movdqa %xmm12, %xmm0 4846; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 4847; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] 4848; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 4849; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 4850; SSE-NEXT: movdqa %xmm15, %xmm3 4851; SSE-NEXT: pandn %xmm1, %xmm3 4852; SSE-NEXT: pand %xmm15, %xmm5 4853; SSE-NEXT: movdqa %xmm15, %xmm13 4854; SSE-NEXT: por %xmm5, %xmm3 4855; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 4856; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 4857; SSE-NEXT: movdqa %xmm0, %xmm1 4858; SSE-NEXT: movdqa %xmm0, %xmm6 4859; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4860; SSE-NEXT: movdqa %xmm9, %xmm3 4861; SSE-NEXT: pandn %xmm1, %xmm3 4862; SSE-NEXT: movdqa %xmm7, %xmm0 4863; SSE-NEXT: movdqa %xmm7, %xmm1 4864; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 4865; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] 4866; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] 4867; SSE-NEXT: pand %xmm9, %xmm1 4868; SSE-NEXT: por %xmm3, %xmm1 4869; SSE-NEXT: pand %xmm14, %xmm1 4870; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] 4871; SSE-NEXT: movdqa %xmm14, %xmm7 4872; SSE-NEXT: pandn %xmm3, %xmm7 4873; SSE-NEXT: por %xmm1, %xmm7 4874; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4875; SSE-NEXT: movdqa %xmm8, %xmm1 4876; SSE-NEXT: psrlq $48, %xmm1 
4877; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 4878; SSE-NEXT: movdqa %xmm9, %xmm1 4879; SSE-NEXT: pandn %xmm0, %xmm1 4880; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] 4881; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,3,3] 4882; SSE-NEXT: pand %xmm9, %xmm8 4883; SSE-NEXT: por %xmm1, %xmm8 4884; SSE-NEXT: movdqa %xmm10, %xmm0 4885; SSE-NEXT: pandn %xmm3, %xmm0 4886; SSE-NEXT: pand %xmm10, %xmm8 4887; SSE-NEXT: por %xmm8, %xmm0 4888; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4889; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 4890; SSE-NEXT: movdqa %xmm11, %xmm1 4891; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4892; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4893; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] 4894; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 4895; SSE-NEXT: movdqa %xmm4, %xmm3 4896; SSE-NEXT: pandn %xmm1, %xmm3 4897; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4898; SSE-NEXT: movdqa %xmm5, %xmm8 4899; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4900; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] 4901; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,3,2,4,5,6,7] 4902; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,1,1] 4903; SSE-NEXT: pand %xmm4, %xmm14 4904; SSE-NEXT: por %xmm3, %xmm14 4905; SSE-NEXT: pand %xmm2, %xmm14 4906; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4907; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] 4908; SSE-NEXT: movdqa %xmm2, %xmm15 4909; SSE-NEXT: pandn %xmm3, %xmm15 4910; SSE-NEXT: por %xmm14, %xmm15 4911; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] 4912; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] 4913; SSE-NEXT: movdqa %xmm0, %xmm14 4914; SSE-NEXT: movdqa %xmm0, %xmm6 4915; SSE-NEXT: movdqa %xmm11, %xmm1 4916; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] 4917; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] 4918; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,4,6,7] 4919; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] 4920; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 4921; SSE-NEXT: movdqa %xmm13, %xmm14 4922; SSE-NEXT: pandn %xmm3, %xmm14 4923; SSE-NEXT: pand %xmm13, %xmm0 4924; SSE-NEXT: por %xmm0, %xmm14 4925; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 4926; SSE-NEXT: movdqa %xmm1, %xmm0 4927; SSE-NEXT: movdqa %xmm1, %xmm6 4928; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4929; SSE-NEXT: movdqa %xmm9, %xmm3 4930; SSE-NEXT: pandn %xmm0, %xmm3 4931; SSE-NEXT: movdqa %xmm5, %xmm0 4932; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 4933; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] 4934; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 4935; SSE-NEXT: pand %xmm9, %xmm0 4936; SSE-NEXT: por %xmm3, %xmm0 4937; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,65535,65535] 4938; SSE-NEXT: pand %xmm11, %xmm0 4939; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] 4940; SSE-NEXT: pandn %xmm1, %xmm11 4941; SSE-NEXT: por %xmm0, %xmm11 4942; SSE-NEXT: movdqa %xmm12, %xmm0 4943; SSE-NEXT: psrlq $48, %xmm0 4944; SSE-NEXT: movdqa %xmm5, %xmm3 4945; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] 4946; 
SSE-NEXT: movdqa %xmm9, %xmm0 4947; SSE-NEXT: pandn %xmm3, %xmm0 4948; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] 4949; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 4950; SSE-NEXT: pand %xmm9, %xmm3 4951; SSE-NEXT: por %xmm0, %xmm3 4952; SSE-NEXT: movdqa %xmm10, %xmm12 4953; SSE-NEXT: pandn %xmm1, %xmm12 4954; SSE-NEXT: pand %xmm10, %xmm3 4955; SSE-NEXT: por %xmm3, %xmm12 4956; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4957; SSE-NEXT: movdqa %xmm0, %xmm1 4958; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4959; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] 4960; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] 4961; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 4962; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4963; SSE-NEXT: movdqa %xmm8, %xmm3 4964; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4965; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] 4966; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,2,4,5,6,7] 4967; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,1] 4968; SSE-NEXT: pand %xmm4, %xmm13 4969; SSE-NEXT: pandn %xmm1, %xmm4 4970; SSE-NEXT: por %xmm13, %xmm4 4971; SSE-NEXT: pand %xmm2, %xmm4 4972; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4973; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] 4974; SSE-NEXT: pandn %xmm1, %xmm2 4975; SSE-NEXT: por %xmm4, %xmm2 4976; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] 4977; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 4978; SSE-NEXT: movdqa %xmm7, %xmm4 4979; SSE-NEXT: movdqa %xmm0, %xmm13 4980; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 4981; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] 4982; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] 4983; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 4984; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 4985; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] 4986; SSE-NEXT: pand %xmm0, %xmm4 4987; SSE-NEXT: pandn %xmm1, %xmm0 4988; SSE-NEXT: por %xmm4, %xmm0 4989; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4990; SSE-NEXT: movdqa %xmm13, %xmm3 4991; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] 4992; SSE-NEXT: movdqa %xmm3, %xmm1 4993; SSE-NEXT: movdqa %xmm3, %xmm7 4994; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4995; SSE-NEXT: movdqa %xmm9, %xmm3 4996; SSE-NEXT: pandn %xmm1, %xmm3 4997; SSE-NEXT: movdqa %xmm8, %xmm1 4998; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 4999; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] 5000; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] 5001; SSE-NEXT: pand %xmm9, %xmm1 5002; SSE-NEXT: por %xmm3, %xmm1 5003; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,0,65535,65535] 5004; SSE-NEXT: pand %xmm0, %xmm1 5005; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 5006; SSE-NEXT: pandn %xmm3, %xmm0 5007; SSE-NEXT: por %xmm1, %xmm0 5008; SSE-NEXT: psrlq $48, %xmm6 5009; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm6[1] 5010; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6] 5011; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 5012; SSE-NEXT: pand %xmm9, %xmm1 5013; SSE-NEXT: pandn %xmm8, %xmm9 5014; SSE-NEXT: por 
%xmm1, %xmm9 5015; SSE-NEXT: pand %xmm10, %xmm9 5016; SSE-NEXT: pandn %xmm3, %xmm10 5017; SSE-NEXT: por %xmm9, %xmm10 5018; SSE-NEXT: movdqa %xmm10, 624(%r9) 5019; SSE-NEXT: movdqa %xmm0, 608(%r9) 5020; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5021; SSE-NEXT: movaps %xmm0, 576(%r9) 5022; SSE-NEXT: movdqa %xmm2, 560(%r9) 5023; SSE-NEXT: movdqa %xmm12, 544(%r9) 5024; SSE-NEXT: movdqa %xmm11, 528(%r9) 5025; SSE-NEXT: movdqa %xmm14, 496(%r9) 5026; SSE-NEXT: movdqa %xmm15, 480(%r9) 5027; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5028; SSE-NEXT: movaps %xmm0, 464(%r9) 5029; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5030; SSE-NEXT: movaps %xmm0, 448(%r9) 5031; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 5032; SSE-NEXT: movaps %xmm0, 416(%r9) 5033; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5034; SSE-NEXT: movaps %xmm0, 400(%r9) 5035; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5036; SSE-NEXT: movaps %xmm0, 384(%r9) 5037; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5038; SSE-NEXT: movaps %xmm0, 368(%r9) 5039; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5040; SSE-NEXT: movaps %xmm0, 336(%r9) 5041; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5042; SSE-NEXT: movaps %xmm0, 320(%r9) 5043; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5044; SSE-NEXT: movaps %xmm0, 304(%r9) 5045; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5046; SSE-NEXT: movaps %xmm0, 288(%r9) 5047; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5048; SSE-NEXT: movaps %xmm0, 256(%r9) 5049; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5050; SSE-NEXT: movaps %xmm0, 240(%r9) 5051; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5052; SSE-NEXT: movaps %xmm0, 224(%r9) 5053; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5054; SSE-NEXT: movaps %xmm0, 208(%r9) 5055; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5056; SSE-NEXT: movaps %xmm0, 176(%r9) 5057; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5058; SSE-NEXT: movaps %xmm0, 160(%r9) 5059; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5060; SSE-NEXT: movaps %xmm0, 144(%r9) 5061; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5062; SSE-NEXT: movaps %xmm0, 128(%r9) 5063; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5064; SSE-NEXT: movaps %xmm0, 96(%r9) 5065; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5066; SSE-NEXT: movaps %xmm0, 80(%r9) 5067; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5068; SSE-NEXT: movaps %xmm0, 64(%r9) 5069; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5070; SSE-NEXT: movaps %xmm0, 48(%r9) 5071; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5072; SSE-NEXT: movaps %xmm0, 16(%r9) 5073; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5074; SSE-NEXT: movaps %xmm0, (%r9) 5075; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5076; SSE-NEXT: movaps %xmm0, 592(%r9) 5077; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5078; SSE-NEXT: movaps %xmm0, 512(%r9) 5079; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5080; SSE-NEXT: movaps %xmm0, 432(%r9) 5081; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
5082; SSE-NEXT: movaps %xmm0, 352(%r9) 5083; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5084; SSE-NEXT: movaps %xmm0, 272(%r9) 5085; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5086; SSE-NEXT: movaps %xmm0, 192(%r9) 5087; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5088; SSE-NEXT: movaps %xmm0, 112(%r9) 5089; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5090; SSE-NEXT: movaps %xmm0, 32(%r9) 5091; SSE-NEXT: addq $616, %rsp # imm = 0x268 5092; SSE-NEXT: retq 5093; 5094; AVX-LABEL: store_i16_stride5_vf64: 5095; AVX: # %bb.0: 5096; AVX-NEXT: subq $392, %rsp # imm = 0x188 5097; AVX-NEXT: vmovdqa 80(%rcx), %xmm4 5098; AVX-NEXT: vmovdqa 80(%rdx), %xmm6 5099; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 5100; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5101; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] 5102; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 5103; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 5104; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] 5105; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 5106; AVX-NEXT: vmovdqa 80(%rdi), %xmm8 5107; AVX-NEXT: vmovdqa 80(%rsi), %xmm10 5108; AVX-NEXT: vpsrlq $48, %xmm10, %xmm1 5109; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] 5110; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 5111; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] 5112; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] 5113; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 5114; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 5115; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 5116; AVX-NEXT: vmovdqa 80(%r8), %xmm9 5117; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] 5118; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] 5119; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5120; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 5121; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] 5122; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5123; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 5124; AVX-NEXT: vmovdqa 64(%rsi), %xmm1 5125; AVX-NEXT: vpsrlq $48, %xmm1, %xmm2 5126; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] 5127; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] 5128; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,2,4,5,6,7] 5129; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] 5130; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 5131; AVX-NEXT: vmovdqa 64(%rcx), %xmm3 5132; AVX-NEXT: vmovdqa 64(%rdx), %xmm5 5133; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 5134; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,7,6] 5135; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] 5136; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 5137; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] 5138; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] 5139; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 5140; AVX-NEXT: vandnps %ymm7, %ymm12, %ymm7 5141; AVX-NEXT: vandps %ymm12, %ymm13, %ymm13 5142; AVX-NEXT: vorps %ymm7, %ymm13, %ymm7 5143; AVX-NEXT: vextractf128 $1, %ymm7, %xmm13 5144; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = 
xmm10[3,3,3,3,4,5,6,7] 5145; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] 5146; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5,6,7] 5147; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5,6,6] 5148; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 5149; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 5150; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] 5151; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] 5152; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] 5153; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] 5154; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] 5155; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] 5156; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5157; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] 5158; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 5159; AVX-NEXT: vandnps %ymm8, %ymm15, %ymm6 5160; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] 5161; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] 5162; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 5163; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4 5164; AVX-NEXT: vorps %ymm6, %ymm4, %ymm4 5165; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm14[1],xmm4[2,3,4,5],xmm14[6],xmm4[7] 5166; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5167; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5168; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3],xmm4[4,5,6,7] 5169; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5170; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 5171; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] 5172; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] 5173; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] 5174; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] 5175; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] 5176; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 5177; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 5178; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,3,2,4,5,6,7] 5179; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] 5180; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] 5181; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 5182; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 5183; AVX-NEXT: vmovdqa 64(%r8), %xmm4 5184; AVX-NEXT: vandnps %ymm6, %ymm15, %ymm6 5185; AVX-NEXT: vandps %ymm15, %ymm8, %ymm8 5186; AVX-NEXT: vorps %ymm6, %ymm8, %ymm6 5187; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] 5188; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm8[4],xmm6[5,6,7] 5189; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5190; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 5191; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] 5192; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5193; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] 5194; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] 5195; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5196; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] 5197; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 5198; AVX-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] 5199; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5200; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5201; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 5202; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 5203; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5204; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] 5205; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5206; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 5207; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm1 5208; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] 5209; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 5210; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 5211; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 5212; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 5213; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] 5214; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5215; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 5216; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5],xmm0[6,7] 5217; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5218; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 5219; AVX-NEXT: vmovdqa 48(%rsi), %xmm5 5220; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] 5221; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5222; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] 5223; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 5224; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] 5225; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5226; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 5227; AVX-NEXT: vmovdqa 48(%rcx), %xmm1 5228; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 5229; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] 5230; AVX-NEXT: vmovdqa 48(%rdx), %xmm3 5231; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] 5232; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] 5233; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 5234; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] 5235; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] 5236; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 5237; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 5238; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 5239; AVX-NEXT: vorps %ymm2, %ymm6, %ymm2 5240; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 5241; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5242; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] 5243; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] 5244; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 5245; AVX-NEXT: vpsrlq $48, %xmm5, %xmm7 5246; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm4[1],xmm7[1] 5247; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 5248; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] 5249; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] 5250; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 5251; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm5 5252; AVX-NEXT: vandps %ymm4, %ymm12, %ymm4 5253; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 5254; AVX-NEXT: vmovdqa 48(%r8), %xmm11 5255; 
AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] 5256; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] 5257; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5258; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5259; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] 5260; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5261; AVX-NEXT: vmovdqa 32(%rcx), %xmm4 5262; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] 5263; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 5264; AVX-NEXT: vmovdqa 32(%rdx), %xmm6 5265; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] 5266; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] 5267; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 5268; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5269; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 5270; AVX-NEXT: vmovdqa 32(%rsi), %xmm5 5271; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[3,3,3,3,4,5,6,7] 5272; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,4,4,4,4] 5273; AVX-NEXT: vmovdqa 32(%rdi), %xmm9 5274; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm9[4],xmm13[5,6,7] 5275; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] 5276; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] 5277; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] 5278; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 5279; AVX-NEXT: vandnps %ymm7, %ymm10, %ymm7 5280; AVX-NEXT: vandps %ymm10, %ymm13, %ymm13 5281; AVX-NEXT: vorps %ymm7, %ymm13, %ymm7 5282; AVX-NEXT: vpsrlq $48, %xmm5, %xmm13 5283; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm13[1] 5284; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] 5285; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] 5286; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 5287; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 5288; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] 5289; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5290; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] 5291; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 5292; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 5293; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 5294; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 5295; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 5296; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 5297; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7] 5298; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5299; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] 5300; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] 5301; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5302; AVX-NEXT: vmovdqa 32(%r8), %xmm2 5303; AVX-NEXT: vextractf128 $1, %ymm7, %xmm3 5304; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] 5305; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] 5306; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5307; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] 5308; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5309; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 5310; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 5311; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5312; AVX-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 5313; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] 5314; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 5315; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5316; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] 5317; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] 5318; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 5319; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] 5320; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] 5321; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] 5322; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] 5323; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5324; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 5325; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 5326; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1 5327; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 5328; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] 5329; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5330; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] 5331; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 5332; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5333; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 5334; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] 5335; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5336; AVX-NEXT: vmovdqa 112(%rsi), %xmm4 5337; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] 5338; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5339; AVX-NEXT: vmovdqa 112(%rdi), %xmm5 5340; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] 5341; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 5342; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] 5343; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5344; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 5345; AVX-NEXT: vmovdqa 112(%rcx), %xmm1 5346; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 5347; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] 5348; AVX-NEXT: vmovdqa 112(%rdx), %xmm3 5349; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] 5350; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] 5351; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 5352; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] 5353; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] 5354; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 5355; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 5356; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 5357; AVX-NEXT: vorps %ymm2, %ymm6, %ymm2 5358; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 5359; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5360; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] 5361; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] 5362; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 5363; AVX-NEXT: vpsrlq $48, %xmm4, %xmm7 5364; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm5[1],xmm7[1] 5365; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 5366; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] 5367; AVX-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm4[0,1,2,2] 5368; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 5369; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm5 5370; AVX-NEXT: vandps %ymm4, %ymm12, %ymm4 5371; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 5372; AVX-NEXT: vmovdqa 112(%r8), %xmm7 5373; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 5374; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] 5375; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5376; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5377; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] 5378; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5379; AVX-NEXT: vmovdqa 96(%rcx), %xmm4 5380; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] 5381; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] 5382; AVX-NEXT: vmovdqa 96(%rdx), %xmm5 5383; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] 5384; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] 5385; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 5386; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5387; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 5388; AVX-NEXT: vmovdqa 96(%rsi), %xmm6 5389; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[3,3,3,3,4,5,6,7] 5390; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,4,4] 5391; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 5392; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6,7] 5393; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] 5394; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] 5395; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] 5396; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 5397; AVX-NEXT: vandnps %ymm9, %ymm10, %ymm9 5398; AVX-NEXT: vandps %ymm10, %ymm13, %ymm13 5399; AVX-NEXT: vorps %ymm9, %ymm13, %ymm9 5400; AVX-NEXT: vpsrlq $48, %xmm6, %xmm13 5401; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm11[1],xmm13[1] 5402; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] 5403; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] 5404; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 5405; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 5406; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] 5407; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5408; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] 5409; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 5410; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 5411; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 5412; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 5413; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 5414; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 5415; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] 5416; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5417; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] 5418; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] 5419; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 5420; AVX-NEXT: vmovdqa 96(%r8), %xmm2 5421; AVX-NEXT: vextractf128 $1, %ymm9, %xmm3 5422; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] 5423; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] 5424; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5425; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] 5426; AVX-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5427; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 5428; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 5429; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5430; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 5431; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 5432; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] 5433; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 5434; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] 5435; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] 5436; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 5437; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] 5438; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] 5439; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] 5440; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] 5441; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5442; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 5443; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 5444; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1 5445; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 5446; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] 5447; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5448; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] 5449; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 5450; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5451; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 5452; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] 5453; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5454; AVX-NEXT: vmovdqa 16(%rdx), %xmm8 5455; AVX-NEXT: vmovdqa 16(%rcx), %xmm7 5456; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 5457; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5458; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] 5459; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 5460; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 5461; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 5462; AVX-NEXT: vmovdqa 16(%rsi), %xmm0 5463; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5 5464; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1] 5465; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 5466; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] 5467; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] 5468; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 5469; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4 5470; AVX-NEXT: vandps %ymm5, %ymm12, %ymm5 5471; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 5472; AVX-NEXT: vmovdqa 16(%r8), %xmm6 5473; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 5474; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] 5475; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5476; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5477; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] 5478; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5479; AVX-NEXT: vmovdqa (%rdi), %xmm10 5480; AVX-NEXT: vmovdqa (%rsi), %xmm9 5481; AVX-NEXT: vpsrlq $48, %xmm9, %xmm4 5482; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm10[1],xmm4[1] 5483; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 5484; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] 5485; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] 5486; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 5487; AVX-NEXT: vmovdqa (%rdx), %xmm4 5488; AVX-NEXT: vmovdqa (%rcx), %xmm5 5489; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 5490; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5,7,6] 5491; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] 5492; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 5493; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] 5494; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] 5495; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 5496; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 5497; AVX-NEXT: vandps %ymm12, %ymm13, %ymm12 5498; AVX-NEXT: vorps %ymm3, %ymm12, %ymm12 5499; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 5500; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5501; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 5502; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] 5503; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5504; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 5505; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] 5506; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 5507; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] 5508; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] 5509; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 5510; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] 5511; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 5512; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 5513; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 5514; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1 5515; AVX-NEXT: vorps %ymm0, %ymm1, %ymm2 5516; AVX-NEXT: vextractf128 $1, %ymm12, %xmm0 5517; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] 5518; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] 5519; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] 5520; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 5521; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5,6,7] 5522; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 5523; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] 5524; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] 5525; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 5526; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] 5527; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] 5528; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 5529; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 5530; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7] 5531; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] 5532; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] 5533; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5534; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 5535; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3 5536; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 5537; AVX-NEXT: vorps %ymm3, %ymm6, %ymm3 5538; AVX-NEXT: vmovdqa (%r8), %xmm6 5539; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] 5540; AVX-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm3[0,1,2,3],xmm7[4],xmm3[5,6,7] 5541; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 5542; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] 5543; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] 5544; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] 5545; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 5546; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] 5547; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 5548; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 5549; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] 5550; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] 5551; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5,6,7] 5552; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 5553; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] 5554; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] 5555; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 5556; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 5557; AVX-NEXT: vandnps %ymm4, %ymm7, %ymm4 5558; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5 5559; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 5560; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 5561; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] 5562; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] 5563; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 5564; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] 5565; AVX-NEXT: vmovdqa %xmm4, 48(%r9) 5566; AVX-NEXT: vmovdqa %xmm6, 32(%r9) 5567; AVX-NEXT: vmovdqa %xmm3, 16(%r9) 5568; AVX-NEXT: vmovdqa %xmm8, (%r9) 5569; AVX-NEXT: vmovdqa %xmm2, 112(%r9) 5570; AVX-NEXT: vmovdqa %xmm1, 96(%r9) 5571; AVX-NEXT: vmovdqa %xmm0, 80(%r9) 5572; AVX-NEXT: vmovdqa %xmm7, 64(%r9) 5573; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5574; AVX-NEXT: vmovaps %xmm0, 144(%r9) 5575; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5576; AVX-NEXT: vmovaps %xmm0, 128(%r9) 5577; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5578; AVX-NEXT: vmovaps %xmm0, 496(%r9) 5579; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5580; AVX-NEXT: vmovaps %xmm0, 480(%r9) 5581; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5582; AVX-NEXT: vmovaps %xmm0, 560(%r9) 5583; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5584; AVX-NEXT: vmovaps %xmm0, 544(%r9) 5585; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5586; AVX-NEXT: vmovaps %xmm0, 528(%r9) 5587; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5588; AVX-NEXT: vmovaps %xmm0, 512(%r9) 5589; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5590; AVX-NEXT: vmovaps %xmm0, 624(%r9) 5591; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5592; AVX-NEXT: vmovaps %xmm0, 608(%r9) 5593; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5594; AVX-NEXT: vmovaps %xmm0, 592(%r9) 5595; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 5596; AVX-NEXT: vmovaps %xmm0, 576(%r9) 5597; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5598; AVX-NEXT: vmovaps %xmm0, 176(%r9) 5599; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5600; AVX-NEXT: vmovaps %xmm0, 160(%r9) 5601; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5602; AVX-NEXT: vmovaps %xmm0, 240(%r9) 5603; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5604; AVX-NEXT: vmovaps %xmm0, 224(%r9) 5605; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5606; AVX-NEXT: vmovaps %xmm0, 208(%r9) 5607; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5608; AVX-NEXT: vmovaps %xmm0, 192(%r9) 5609; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5610; AVX-NEXT: vmovaps %xmm0, 304(%r9) 5611; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5612; AVX-NEXT: vmovaps %xmm0, 288(%r9) 5613; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5614; AVX-NEXT: vmovaps %xmm0, 272(%r9) 5615; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5616; AVX-NEXT: vmovaps %xmm0, 256(%r9) 5617; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5618; AVX-NEXT: vmovaps %xmm0, 368(%r9) 5619; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5620; AVX-NEXT: vmovaps %xmm0, 352(%r9) 5621; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5622; AVX-NEXT: vmovaps %xmm0, 336(%r9) 5623; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5624; AVX-NEXT: vmovaps %xmm0, 320(%r9) 5625; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5626; AVX-NEXT: vmovaps %xmm0, 432(%r9) 5627; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5628; AVX-NEXT: vmovaps %xmm0, 416(%r9) 5629; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5630; AVX-NEXT: vmovaps %xmm0, 400(%r9) 5631; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5632; AVX-NEXT: vmovaps %xmm0, 384(%r9) 5633; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5634; AVX-NEXT: vmovaps %xmm0, 464(%r9) 5635; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5636; AVX-NEXT: vmovaps %xmm0, 448(%r9) 5637; AVX-NEXT: addq $392, %rsp # imm = 0x188 5638; AVX-NEXT: vzeroupper 5639; AVX-NEXT: retq 5640; 5641; AVX2-LABEL: store_i16_stride5_vf64: 5642; AVX2: # %bb.0: 5643; AVX2-NEXT: subq $968, %rsp # imm = 0x3C8 5644; AVX2-NEXT: vmovdqa (%rdx), %xmm3 5645; AVX2-NEXT: vmovdqa 32(%rdx), %xmm12 5646; AVX2-NEXT: vmovdqa 64(%rdx), %xmm4 5647; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5648; AVX2-NEXT: vmovdqa (%rcx), %xmm9 5649; AVX2-NEXT: vmovdqa 32(%rcx), %xmm0 5650; AVX2-NEXT: vmovdqa 64(%rcx), %xmm5 5651; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5652; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] 5653; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 5654; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5655; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 5656; AVX2-NEXT: vmovdqa (%rsi), %xmm11 5657; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 5658; AVX2-NEXT: vmovdqa 64(%rsi), %xmm6 5659; AVX2-NEXT: vmovdqa (%rdi), %xmm7 5660; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 5661; AVX2-NEXT: vmovdqa 64(%rdi), %xmm15 5662; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] 5663; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 5664; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] 5665; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 5666; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 5667; AVX2-NEXT: 
vpblendvb %ymm13, %ymm7, %ymm1, %ymm8 5668; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] 5669; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5670; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 5671; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] 5672; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 5673; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] 5674; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 5675; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm14 5676; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 5677; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5678; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 5679; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] 5680; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 5681; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] 5682; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 5683; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm15 5684; AVX2-NEXT: vmovdqa 96(%rdx), %xmm1 5685; AVX2-NEXT: vmovdqa 96(%rcx), %xmm7 5686; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] 5687; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 5688; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,1] 5689; AVX2-NEXT: vmovdqa 96(%rsi), %xmm2 5690; AVX2-NEXT: vmovdqa 96(%rdi), %xmm5 5691; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 5692; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] 5693; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] 5694; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 5695; AVX2-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm4 5696; AVX2-NEXT: vpbroadcastq (%r8), %ymm5 5697; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 5698; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm5, %ymm5 5699; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5700; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 5701; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm5, %ymm5 5702; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5703; AVX2-NEXT: vpbroadcastq 64(%r8), %ymm5 5704; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm5, %ymm5 5705; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5706; AVX2-NEXT: vpbroadcastq 96(%r8), %ymm5 5707; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 5708; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5709; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 5710; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm4 5711; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm5 5712; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 5713; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 5714; AVX2-NEXT: vpshufb %xmm14, %xmm9, %xmm5 5715; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] 5716; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] 5717; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 5718; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] 5719; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 5720; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 5721; AVX2-NEXT: vpshufb %xmm8, %xmm10, %xmm4 5722; AVX2-NEXT: 
vpbroadcastq 40(%rdi), %xmm5 5723; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 5724; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0 5725; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,2,2,2] 5726; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] 5727; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 5728; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] 5729; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 5730; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm4 5731; AVX2-NEXT: vpbroadcastq 72(%rdi), %xmm5 5732; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 5733; AVX2-NEXT: vmovdqa (%rdx), %ymm10 5734; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5735; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5736; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm5 5737; AVX2-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 5738; AVX2-NEXT: # xmm6 = mem[1,2,2,2] 5739; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] 5740; AVX2-NEXT: vmovdqa (%rcx), %ymm12 5741; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5742; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 5743; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] 5744; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 5745; AVX2-NEXT: vmovdqa (%r8), %ymm6 5746; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5747; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 5748; AVX2-NEXT: vpbroadcastq 104(%rdi), %xmm5 5749; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] 5750; AVX2-NEXT: vmovdqa 32(%r8), %ymm8 5751; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5752; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm5 5753; AVX2-NEXT: vmovdqa 64(%r8), %ymm7 5754; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5755; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] 5756; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] 5757; AVX2-NEXT: vmovdqa 96(%r8), %ymm11 5758; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5759; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 5760; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] 5761; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 5762; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,1] 5763; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 5764; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 5765; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5766; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] 5767; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 5768; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5769; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] 5770; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 5771; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5772; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,1] 5773; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 5774; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5775; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 5776; AVX2-NEXT: # ymm9 = mem[0,1,0,1] 5777; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm1 5778; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm10[1,1,1,2,5,5,5,6] 5779; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 5780; AVX2-NEXT: vmovdqa (%rsi), %ymm7 5781; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 5782; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5783; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 5784; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5785; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5786; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,1,4,5,6,5] 5787; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] 5788; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 5789; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 5790; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 5791; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10 5792; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 5793; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5794; AVX2-NEXT: vmovdqa 32(%rcx), %ymm1 5795; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5796; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm1 5797; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] 5798; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 5799; AVX2-NEXT: vmovdqa 32(%rsi), %ymm13 5800; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 5801; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5802; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 5803; AVX2-NEXT: vmovdqa 32(%rdi), %ymm12 5804; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,2,1,4,5,6,5] 5805; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5806; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] 5807; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 5808; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 5809; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 5810; AVX2-NEXT: vmovdqa 64(%rdx), %ymm0 5811; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5812; AVX2-NEXT: vmovdqa 64(%rcx), %ymm1 5813; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5814; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm2 5815; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] 5816; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 5817; AVX2-NEXT: vmovdqa 64(%rsi), %ymm11 5818; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 5819; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5820; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 5821; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 5822; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,1,2,1,4,5,6,5] 5823; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5824; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] 5825; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 5826; AVX2-NEXT: vpermq {{.*#+}} ymm3 = 
ymm3[2,3,2,3] 5827; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm1 5828; AVX2-NEXT: vmovdqa 96(%rcx), %ymm0 5829; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5830; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm2 5831; AVX2-NEXT: vmovdqa 96(%rdx), %ymm0 5832; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5833; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] 5834; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 5835; AVX2-NEXT: vmovdqa 96(%rsi), %ymm6 5836; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] 5837; AVX2-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill 5838; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] 5839; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 5840; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,1,4,5,6,5] 5841; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3],ymm14[4],ymm0[5,6],ymm14[7],ymm0[8,9],ymm14[10],ymm0[11],ymm14[12],ymm0[13,14],ymm14[15] 5842; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 5843; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 5844; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm0, %ymm0 5845; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm9 5846; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 5847; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm9, %ymm9 5848; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5849; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm9 5850; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm9, %ymm2 5851; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5852; AVX2-NEXT: vpbroadcastq 80(%r8), %ymm5 5853; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 5854; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5855; AVX2-NEXT: vpbroadcastq 112(%r8), %ymm1 5856; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 5857; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5858; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 5859; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] 5860; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 5861; AVX2-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] 5862; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 5863; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 5864; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 5865; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 5866; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5867; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,2,3,3,7,6,7,7] 5868; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] 5869; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 5870; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] 5871; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 5872; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 5873; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5874; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 5875; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] 5876; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm12[2,3,2,3,6,7,6,7] 5877; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 5878; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 5879; AVX2-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 5880; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] 5881; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5882; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] 5883; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] 5884; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 5885; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] 5886; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 5887; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 5888; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] 5889; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] 5890; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] 5891; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5892; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 5893; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] 5894; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5895; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,2,3,3,7,6,7,7] 5896; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3,4],ymm5[5,6,7,8],ymm9[9],ymm5[10],ymm9[11,12],ymm5[13,14,15] 5897; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 5898; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] 5899; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 5900; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 5901; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] 5902; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] 5903; AVX2-NEXT: vmovdqa %ymm3, %ymm6 5904; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] 5905; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5906; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 5907; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] 5908; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5909; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,2,3,3,7,6,7,7] 5910; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] 5911; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] 5912; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 5913; AVX2-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 5914; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm9 5915; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 5916; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload 5917; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm9 5918; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 5919; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5920; AVX2-NEXT: vpbroadcastq 88(%r8), %ymm0 5921; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 5922; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
5923; AVX2-NEXT: vpbroadcastq 120(%r8), %ymm0 5924; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 5925; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5926; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 5927; AVX2-NEXT: vpshufb %ymm0, %ymm10, %ymm1 5928; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4] 5929; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] 5930; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 5931; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5932; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm9 5933; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 5934; AVX2-NEXT: # ymm14 = mem[1,1,2,2] 5935; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] 5936; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 5937; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 5938; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5939; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm9 5940; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,0,3,0,7,4,7,4] 5941; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] 5942; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 5943; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm12 5944; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 5945; AVX2-NEXT: # ymm11 = mem[1,1,2,2] 5946; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] 5947; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm9 5948; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm10 5949; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[3,0,3,0,7,4,7,4] 5950; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] 5951; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 5952; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm8 5953; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload 5954; AVX2-NEXT: # ymm7 = mem[1,1,2,2] 5955; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] 5956; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm7 5957; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 5958; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,0,3,0,7,4,7,4] 5959; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] 5960; AVX2-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload 5961; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm3 5962; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,1,2,2] 5963; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 5964; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 5965; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 5966; AVX2-NEXT: # ymm2 = 
mem[1,1,2,2] 5967; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 5968; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 5969; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 5970; AVX2-NEXT: # ymm2 = mem[1,1,2,2] 5971; AVX2-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 5972; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 5973; AVX2-NEXT: # ymm4 = mem[1,1,2,2] 5974; AVX2-NEXT: vpblendvb %ymm3, %ymm7, %ymm4, %ymm4 5975; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 5976; AVX2-NEXT: # ymm5 = mem[1,1,2,2] 5977; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 5978; AVX2-NEXT: vmovdqa %ymm0, 544(%r9) 5979; AVX2-NEXT: vmovdqa %ymm4, 384(%r9) 5980; AVX2-NEXT: vmovdqa %ymm2, 224(%r9) 5981; AVX2-NEXT: vmovdqa %ymm1, 64(%r9) 5982; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5983; AVX2-NEXT: vmovaps %ymm0, 608(%r9) 5984; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5985; AVX2-NEXT: vmovaps %ymm0, 576(%r9) 5986; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5987; AVX2-NEXT: vmovaps %ymm0, 512(%r9) 5988; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5989; AVX2-NEXT: vmovaps %ymm0, 448(%r9) 5990; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5991; AVX2-NEXT: vmovaps %ymm0, 416(%r9) 5992; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5993; AVX2-NEXT: vmovaps %ymm0, 352(%r9) 5994; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5995; AVX2-NEXT: vmovaps %ymm0, 288(%r9) 5996; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5997; AVX2-NEXT: vmovaps %ymm0, 256(%r9) 5998; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5999; AVX2-NEXT: vmovaps %ymm0, 192(%r9) 6000; AVX2-NEXT: vmovdqa %ymm13, 128(%r9) 6001; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6002; AVX2-NEXT: vmovaps %ymm0, 96(%r9) 6003; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6004; AVX2-NEXT: vmovaps %ymm0, 32(%r9) 6005; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6006; AVX2-NEXT: vmovaps %ymm0, 480(%r9) 6007; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6008; AVX2-NEXT: vmovaps %ymm0, 320(%r9) 6009; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6010; AVX2-NEXT: vmovaps %ymm0, 160(%r9) 6011; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6012; AVX2-NEXT: vmovaps %ymm0, (%r9) 6013; AVX2-NEXT: addq $968, %rsp # imm = 0x3C8 6014; AVX2-NEXT: vzeroupper 6015; AVX2-NEXT: retq 6016; 6017; AVX2-FP-LABEL: store_i16_stride5_vf64: 6018; AVX2-FP: # %bb.0: 6019; AVX2-FP-NEXT: subq $936, %rsp # imm = 0x3A8 6020; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5 6021; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm2 6022; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm3 6023; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 6024; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm14 6025; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm13 6026; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] 6027; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 6028; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 6029; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6030; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm7 6031; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm10 6032; AVX2-FP-NEXT: vmovdqa 
%xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6033; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm6 6034; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm8 6035; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm4 6036; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm1 6037; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6038; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 6039; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 6040; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 6041; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] 6042; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 6043; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 6044; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6045; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] 6046; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 6047; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6048; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 6049; AVX2-FP-NEXT: vpshufb %xmm11, %xmm14, %xmm14 6050; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] 6051; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 6052; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] 6053; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 6054; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6055; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] 6056; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 6057; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 6058; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 6059; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm13 6060; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm15 6061; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] 6062; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm0 6063; AVX2-FP-NEXT: vmovdqa 96(%rdx), %xmm9 6064; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm15 6065; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] 6066; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 6067; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6068; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] 6069; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 6070; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm10 6071; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 6072; AVX2-FP-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload 6073; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6074; AVX2-FP-NEXT: vpbroadcastq 32(%r8), %ymm10 6075; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 6076; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6077; AVX2-FP-NEXT: vpbroadcastq 64(%r8), %ymm10 6078; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 6079; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6080; AVX2-FP-NEXT: vpbroadcastq 96(%r8), %ymm1 6081; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 6082; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6083; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 6084; 
AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm0 6085; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm1 6086; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] 6087; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 6088; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 6089; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] 6090; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] 6091; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 6092; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] 6093; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 6094; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 6095; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 6096; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm2 6097; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 6098; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 6099; AVX2-FP-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 6100; AVX2-FP-NEXT: # xmm4 = mem[1,2,2,2] 6101; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] 6102; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 6103; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 6104; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 6105; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 6106; AVX2-FP-NEXT: vpbroadcastq 72(%rdi), %xmm4 6107; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] 6108; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm12 6109; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6110; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6111; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6112; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] 6113; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] 6114; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm14 6115; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6116; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6117; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] 6118; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 6119; AVX2-FP-NEXT: vmovdqa (%r8), %ymm6 6120; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6121; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm3 6122; AVX2-FP-NEXT: vpbroadcastq 104(%rdi), %xmm4 6123; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] 6124; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm8 6125; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6126; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 6127; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm10 6128; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6129; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] 6130; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] 6131; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm9 6132; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6133; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 6134; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] 6135; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 6136; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] 6137; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 6138; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 6139; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6140; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] 6141; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 6142; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6143; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] 6144; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 6145; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6146; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] 6147; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 6148; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6149; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 6150; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] 6151; AVX2-FP-NEXT: vpshufb %ymm1, %ymm14, %ymm0 6152; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] 6153; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 6154; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 6155; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6156; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 6157; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] 6158; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] 6159; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm3 6160; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6161; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] 6162; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] 6163; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 6164; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 6165; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 6166; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 6167; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm4 6168; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6169; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 6170; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6171; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 6172; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] 6173; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 6174; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 6175; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6176; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm6 6177; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 6178; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] 6179; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] 6180; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 6181; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 6182; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 6183; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm4 6184; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6185; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm2 6186; AVX2-FP-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6187; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 6188; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] 6189; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 6190; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 6191; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill 6192; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm5 6193; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 6194; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6195; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] 6196; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] 6197; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 6198; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 6199; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 6200; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm2 6201; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6202; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 6203; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm7 6204; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] 6205; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] 6206; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm3 6207; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 6208; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6209; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 6210; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] 6211; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6212; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] 6213; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 6214; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 6215; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 6216; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm13 6217; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 6218; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 6219; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6220; AVX2-FP-NEXT: vpbroadcastq 48(%r8), %ymm13 6221; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 6222; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6223; AVX2-FP-NEXT: vpbroadcastq 80(%r8), %ymm10 6224; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 6225; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6226; AVX2-FP-NEXT: vpbroadcastq 112(%r8), %ymm10 6227; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 6228; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6229; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 6230; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] 6231; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm10 6232; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 6233; AVX2-FP-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] 6234; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] 
6235; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 6236; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6237; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 6238; AVX2-FP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 6239; AVX2-FP-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] 6240; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] 6241; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] 6242; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] 6243; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 6244; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 6245; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm13 6246; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6247; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] 6248; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 6249; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6250; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm15 6251; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6252; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] 6253; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] 6254; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] 6255; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] 6256; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 6257; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm13 6258; AVX2-FP-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload 6259; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 6260; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 6261; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6262; AVX2-FP-NEXT: vpshufb %ymm12, %ymm5, %ymm15 6263; AVX2-FP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload 6264; AVX2-FP-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] 6265; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] 6266; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] 6267; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 6268; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 6269; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 6270; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] 6271; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] 6272; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6273; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm12 6274; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] 6275; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm3 6276; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] 6277; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 6278; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] 6279; AVX2-FP-NEXT: vpblendvb 
%ymm14, %ymm0, %ymm12, %ymm0 6280; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm12 6281; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 6282; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 6283; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6284; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm10 6285; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 6286; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6287; AVX2-FP-NEXT: vpbroadcastq 88(%r8), %ymm10 6288; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 6289; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6290; AVX2-FP-NEXT: vpbroadcastq 120(%r8), %ymm9 6291; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 6292; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 6293; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6294; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm11 6295; AVX2-FP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 6296; AVX2-FP-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] 6297; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] 6298; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 6299; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6300; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 6301; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload 6302; AVX2-FP-NEXT: # ymm13 = mem[1,1,2,2] 6303; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] 6304; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 6305; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 6306; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 6307; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] 6308; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] 6309; AVX2-FP-NEXT: vpshufb %ymm14, %ymm6, %ymm13 6310; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] 6311; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] 6312; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 6313; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm7 6314; AVX2-FP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 6315; AVX2-FP-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] 6316; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] 6317; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6318; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm5 6319; AVX2-FP-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload 6320; AVX2-FP-NEXT: # ymm6 = mem[1,1,2,2] 6321; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] 6322; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 6323; 
AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6324; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm4 6325; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] 6326; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 6327; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6328; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 6329; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 6330; AVX2-FP-NEXT: # ymm1 = mem[1,1,2,2] 6331; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 6332; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 6333; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 6334; AVX2-FP-NEXT: # ymm2 = mem[1,1,2,2] 6335; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 6336; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 6337; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 6338; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] 6339; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 6340; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6341; AVX2-FP-NEXT: # ymm6 = mem[1,1,2,2] 6342; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 6343; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6344; AVX2-FP-NEXT: # ymm6 = mem[1,1,2,2] 6345; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 6346; AVX2-FP-NEXT: vmovdqa %ymm1, 544(%r9) 6347; AVX2-FP-NEXT: vmovdqa %ymm5, 384(%r9) 6348; AVX2-FP-NEXT: vmovdqa %ymm4, 224(%r9) 6349; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%r9) 6350; AVX2-FP-NEXT: vmovdqa %ymm0, 608(%r9) 6351; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6352; AVX2-FP-NEXT: vmovaps %ymm0, 576(%r9) 6353; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6354; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r9) 6355; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6356; AVX2-FP-NEXT: vmovaps %ymm0, 416(%r9) 6357; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6358; AVX2-FP-NEXT: vmovaps %ymm0, 288(%r9) 6359; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6360; AVX2-FP-NEXT: vmovaps %ymm0, 256(%r9) 6361; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6362; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r9) 6363; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6364; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r9) 6365; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6366; AVX2-FP-NEXT: vmovaps %ymm0, 512(%r9) 6367; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6368; AVX2-FP-NEXT: vmovaps %ymm0, 480(%r9) 6369; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6370; AVX2-FP-NEXT: vmovaps %ymm0, 352(%r9) 6371; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6372; AVX2-FP-NEXT: vmovaps %ymm0, 320(%r9) 6373; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6374; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r9) 6375; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6376; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r9) 6377; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload 6378; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) 6379; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6380; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) 6381; AVX2-FP-NEXT: addq $936, %rsp # imm = 0x3A8 6382; AVX2-FP-NEXT: vzeroupper 6383; AVX2-FP-NEXT: retq 6384; 6385; AVX2-FCP-LABEL: store_i16_stride5_vf64: 6386; AVX2-FCP: # %bb.0: 6387; AVX2-FCP-NEXT: subq $936, %rsp # imm = 0x3A8 6388; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5 6389; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 6390; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 6391; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 6392; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 6393; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm13 6394; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] 6395; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 6396; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 6397; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6398; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 6399; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 6400; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6401; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 6402; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 6403; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 6404; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 6405; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6406; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 6407; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 6408; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 6409; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] 6410; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 6411; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 6412; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6413; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] 6414; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 6415; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6416; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 6417; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm14 6418; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] 6419; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 6420; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] 6421; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 6422; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 6423; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] 6424; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 6425; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 6426; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 6427; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm13 6428; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 6429; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] 6430; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm0 6431; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm9 6432; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm15 6433; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] 6434; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 6435; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,1,0,1] 6436; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] 6437; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 6438; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm10 6439; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 6440; AVX2-FCP-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload 6441; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6442; AVX2-FCP-NEXT: vpbroadcastq 32(%r8), %ymm10 6443; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 6444; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6445; AVX2-FCP-NEXT: vpbroadcastq 64(%r8), %ymm10 6446; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 6447; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6448; AVX2-FCP-NEXT: vpbroadcastq 96(%r8), %ymm1 6449; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 6450; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6451; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 6452; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm0 6453; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm1 6454; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] 6455; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 6456; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 6457; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] 6458; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] 6459; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 6460; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] 6461; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 6462; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 6463; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 6464; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm2 6465; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 6466; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 6467; AVX2-FCP-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 6468; AVX2-FCP-NEXT: # xmm4 = mem[1,2,2,2] 6469; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] 6470; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 6471; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 6472; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 6473; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 6474; AVX2-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm4 6475; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] 6476; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm12 6477; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6478; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6479; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6480; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] 6481; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] 6482; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm14 6483; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6484; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 6485; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] 6486; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 6487; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm6 6488; 
AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6489; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm3 6490; AVX2-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm4 6491; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] 6492; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm8 6493; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6494; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 6495; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm10 6496; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6497; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] 6498; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] 6499; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm9 6500; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6501; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 6502; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] 6503; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 6504; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] 6505; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 6506; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 6507; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6508; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] 6509; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 6510; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6511; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] 6512; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 6513; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6514; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] 6515; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 6516; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6517; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 6518; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 6519; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm0 6520; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] 6521; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 6522; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 6523; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6524; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 6525; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] 6526; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] 6527; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm3 6528; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6529; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] 6530; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] 6531; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 6532; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 6533; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] 6534; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 6535; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 6536; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6537; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 6538; 
AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6539; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 6540; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] 6541; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 6542; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 6543; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6544; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 6545; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 6546; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] 6547; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] 6548; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 6549; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 6550; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 6551; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm4 6552; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6553; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 6554; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6555; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 6556; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] 6557; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 6558; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 6559; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill 6560; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 6561; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 6562; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6563; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] 6564; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] 6565; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 6566; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 6567; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 6568; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 6569; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6570; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 6571; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm7 6572; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] 6573; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] 6574; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm3 6575; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 6576; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6577; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 6578; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] 6579; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6580; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] 6581; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 6582; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 6583; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 6584; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm13 6585; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 6586; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm8, 
%ymm13, %ymm1 6587; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6588; AVX2-FCP-NEXT: vpbroadcastq 48(%r8), %ymm13 6589; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 6590; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6591; AVX2-FCP-NEXT: vpbroadcastq 80(%r8), %ymm10 6592; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 6593; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6594; AVX2-FCP-NEXT: vpbroadcastq 112(%r8), %ymm10 6595; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 6596; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6597; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 6598; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] 6599; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm10 6600; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 6601; AVX2-FCP-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] 6602; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] 6603; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 6604; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6605; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 6606; AVX2-FCP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 6607; AVX2-FCP-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] 6608; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] 6609; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] 6610; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] 6611; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] 6612; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 6613; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm13 6614; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6615; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] 6616; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 6617; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6618; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm15 6619; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6620; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] 6621; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] 6622; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] 6623; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] 6624; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 6625; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm13 6626; AVX2-FCP-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload 6627; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] 6628; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 6629; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6630; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm15 6631; 
AVX2-FCP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload 6632; AVX2-FCP-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] 6633; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] 6634; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] 6635; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 6636; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 6637; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 6638; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] 6639; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] 6640; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6641; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm12 6642; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] 6643; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm3 6644; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] 6645; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 6646; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] 6647; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 6648; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm12 6649; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 6650; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 6651; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6652; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm10 6653; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 6654; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6655; AVX2-FCP-NEXT: vpbroadcastq 88(%r8), %ymm10 6656; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 6657; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6658; AVX2-FCP-NEXT: vpbroadcastq 120(%r8), %ymm9 6659; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 6660; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 6661; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6662; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm11 6663; AVX2-FCP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload 6664; AVX2-FCP-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] 6665; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] 6666; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 6667; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6668; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 6669; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload 6670; AVX2-FCP-NEXT: # ymm13 = mem[1,1,2,2] 6671; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] 6672; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] 6673; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 6674; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 6675; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] 6676; AVX2-FCP-NEXT: 
vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] 6677; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm13 6678; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] 6679; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] 6680; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 6681; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm7 6682; AVX2-FCP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload 6683; AVX2-FCP-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] 6684; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] 6685; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6686; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm5 6687; AVX2-FCP-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload 6688; AVX2-FCP-NEXT: # ymm6 = mem[1,1,2,2] 6689; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] 6690; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 6691; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6692; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm4 6693; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] 6694; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 6695; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6696; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 6697; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 6698; AVX2-FCP-NEXT: # ymm1 = mem[1,1,2,2] 6699; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 6700; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 6701; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 6702; AVX2-FCP-NEXT: # ymm2 = mem[1,1,2,2] 6703; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 6704; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 6705; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 6706; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] 6707; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 6708; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6709; AVX2-FCP-NEXT: # ymm6 = mem[1,1,2,2] 6710; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 6711; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 6712; AVX2-FCP-NEXT: # ymm6 = mem[1,1,2,2] 6713; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 6714; AVX2-FCP-NEXT: vmovdqa %ymm1, 544(%r9) 6715; AVX2-FCP-NEXT: vmovdqa %ymm5, 384(%r9) 6716; AVX2-FCP-NEXT: vmovdqa %ymm4, 224(%r9) 6717; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%r9) 6718; AVX2-FCP-NEXT: vmovdqa %ymm0, 608(%r9) 6719; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6720; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%r9) 6721; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6722; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r9) 6723; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6724; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%r9) 6725; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6726; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%r9) 6727; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6728; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%r9) 6729; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6730; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) 6731; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6732; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r9) 6733; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6734; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%r9) 6735; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6736; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%r9) 6737; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6738; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%r9) 6739; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6740; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%r9) 6741; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6742; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) 6743; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6744; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r9) 6745; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6746; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) 6747; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6748; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) 6749; AVX2-FCP-NEXT: addq $936, %rsp # imm = 0x3A8 6750; AVX2-FCP-NEXT: vzeroupper 6751; AVX2-FCP-NEXT: retq 6752; 6753; AVX512-LABEL: store_i16_stride5_vf64: 6754; AVX512: # %bb.0: 6755; AVX512-NEXT: subq $488, %rsp # imm = 0x1E8 6756; AVX512-NEXT: vmovdqa 96(%rcx), %ymm11 6757; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 6758; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm0 6759; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm17 6760; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] 6761; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 6762; AVX512-NEXT: vmovdqa 96(%rcx), %xmm0 6763; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 6764; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm3 6765; AVX512-NEXT: vmovdqa 96(%rdx), %xmm2 6766; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2] 6767; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] 6768; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] 6769; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 6770; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6771; AVX512-NEXT: vmovdqa 96(%rsi), %ymm9 6772; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 6773; AVX512-NEXT: vpshufb %ymm3, %ymm9, %ymm1 6774; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20 6775; AVX512-NEXT: vmovdqa 96(%rdi), %ymm4 6776; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,1,2,2] 6777; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] 6778; AVX512-NEXT: vmovdqa 96(%rsi), %xmm1 6779; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 6780; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6 6781; AVX512-NEXT: vpbroadcastq 
104(%rdi), %xmm8 6782; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5],xmm8[6],xmm6[7] 6783; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 6784; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 6785; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6786; AVX512-NEXT: vmovdqa 96(%r8), %ymm3 6787; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,zero,zero 6788; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 6789; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] 6790; AVX512-NEXT: vpandn %ymm3, %ymm15, %ymm3 6791; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 6792; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6793; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 6794; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 6795; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0 6796; AVX512-NEXT: vmovdqa64 (%rdx), %ymm16 6797; AVX512-NEXT: vmovdqa 64(%rdx), %ymm2 6798; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6799; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] 6800; AVX512-NEXT: vmovdqa 64(%rcx), %ymm3 6801; AVX512-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 6802; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 6803; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 6804; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 6805; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 6806; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] 6807; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6808; AVX512-NEXT: vmovdqa 96(%rdi), %xmm0 6809; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6810; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 6811; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 6812; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm18 6813; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,3,2,3,6,7,6,7] 6814; AVX512-NEXT: vmovdqa 64(%rsi), %ymm2 6815; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6816; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 6817; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] 6818; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 6819; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 6820; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[0,1,2,3],zmm0[0,1,0,1] 6821; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 6822; AVX512-NEXT: vmovdqa 32(%rcx), %xmm6 6823; AVX512-NEXT: vmovdqa 64(%rcx), %xmm1 6824; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6825; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1 6826; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] 6827; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 6828; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6829; 
AVX512-NEXT: vmovdqa 64(%rsi), %xmm0 6830; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm1 6831; AVX512-NEXT: vpbroadcastq 72(%rdi), %xmm2 6832; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 6833; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 6834; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 6835; AVX512-NEXT: vmovdqa (%rsi), %xmm0 6836; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 6837; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 6838; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 6839; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6840; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm1 6841; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm2 6842; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 6843; AVX512-NEXT: vmovdqa (%rdi), %xmm2 6844; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 6845; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[3,2,3,3,7,6,7,7] 6846; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 6847; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 6848; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 6849; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6850; AVX512-NEXT: vmovdqa (%rcx), %ymm2 6851; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 6852; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 6853; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3,4],ymm0[5,6,7,8],ymm8[9],ymm0[10],ymm8[11,12],ymm0[13,14,15] 6854; AVX512-NEXT: vmovdqa 32(%rdx), %xmm8 6855; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] 6856; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] 6857; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1 6858; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[0,1,0,1] 6859; AVX512-NEXT: vmovdqa64 (%rdi), %ymm30 6860; AVX512-NEXT: vmovdqa (%rsi), %ymm10 6861; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 6862; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] 6863; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[2,3,2,3,6,7,6,7] 6864; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 6865; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 6866; AVX512-NEXT: vmovdqa 32(%rsi), %xmm13 6867; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 6868; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] 6869; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 6870; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] 6871; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm0[0,1,2,3],zmm1[0,1,0,1] 6872; AVX512-NEXT: vmovdqa64 32(%rdx), %ymm19 6873; AVX512-NEXT: vmovdqa 32(%rcx), %ymm1 6874; AVX512-NEXT: vpshufb %ymm14, %ymm1, %ymm0 6875; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,0,3,0,7,4,7,4] 6876; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8],ymm0[9],ymm14[10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] 6877; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6 6878; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] 6879; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6],xmm6[7] 6880; AVX512-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[0,1,0,0] 6881; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 6882; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6883; AVX512-NEXT: vmovdqa 32(%rdi), %ymm6 6884; AVX512-NEXT: vmovdqa 32(%rsi), %ymm0 6885; AVX512-NEXT: vmovdqa64 %ymm20, %ymm8 6886; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm8 6887; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm6[1,1,2,2] 6888; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15] 6889; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 6890; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm13 6891; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] 6892; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 6893; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm24 6894; AVX512-NEXT: vmovdqa 32(%r8), %ymm5 6895; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,1,1,1] 6896; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 6897; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm5 6898; AVX512-NEXT: vpandn %ymm8, %ymm15, %ymm8 6899; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm26 6900; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm3 6901; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6902; AVX512-NEXT: vmovdqa (%rcx), %xmm3 6903; AVX512-NEXT: vmovdqa (%rdx), %xmm5 6904; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] 6905; AVX512-NEXT: vpshufb %xmm12, %xmm13, %xmm8 6906; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29 6907; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3 6908; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] 6909; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] 6910; AVX512-NEXT: vmovdqa64 %ymm3, %ymm28 6911; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] 6912; AVX512-NEXT: vprolq $16, %ymm9, %ymm5 6913; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] 6914; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 6915; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 6916; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] 6917; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 6918; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 6919; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 6920; AVX512-NEXT: # ymm7 = mem[0,1,0,1] 6921; AVX512-NEXT: vpshufb %ymm7, %ymm11, %ymm3 6922; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,2,5,5,5,6] 6923; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] 6924; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 6925; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 6926; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,2,3,3,7,6,7,7] 6927; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2],ymm5[3,4],ymm3[5,6,7,8],ymm5[9],ymm3[10],ymm5[11,12],ymm3[13,14,15] 6928; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,2,1,4,5,6,5] 6929; AVX512-NEXT: vprolq $16, %ymm0, %ymm5 6930; 
AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] 6931; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 6932; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] 6933; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,3,2,3,6,7,6,7] 6934; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] 6935; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21 6936; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm0 6937; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,1,1,2,5,5,5,6] 6938; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13],ymm3[14],ymm0[15] 6939; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 6940; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 6941; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,2,3,3,7,6,7,7] 6942; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] 6943; AVX512-NEXT: vmovdqa64 %ymm20, %ymm5 6944; AVX512-NEXT: vpshufb %ymm5, %ymm10, %ymm0 6945; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm30[1,1,2,2] 6946; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 6947; AVX512-NEXT: vprolq $16, %ymm10, %ymm1 6948; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,1,4,5,6,5] 6949; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8,9],ymm3[10],ymm1[11],ymm3[12],ymm1[13,14],ymm3[15] 6950; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 6951; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 6952; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 6953; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm0 6954; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,0,3,0,7,4,7,4] 6955; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 6956; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm1 6957; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[1,1,1,2,5,5,5,6] 6958; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 6959; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 6960; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm9 6961; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 6962; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm16, %ymm0 6963; AVX512-NEXT: vmovdqa (%r8), %ymm10 6964; AVX512-NEXT: vpshufb %ymm14, %ymm10, %ymm1 6965; AVX512-NEXT: vmovdqa64 %ymm14, %ymm23 6966; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 6967; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6968; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm0 6969; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm18[1,1,2,2] 6970; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 6971; AVX512-NEXT: vprolq $16, %ymm2, %ymm1 6972; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm18[0,1,2,1,4,5,6,5] 6973; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] 6974; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 6975; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 6976; AVX512-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload 6977; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm0 6978; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6979; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,0,3,0,7,4,7,4] 6980; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 6981; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm4 6982; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 6983; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm31 # 64-byte Folded Reload 6984; AVX512-NEXT: # zmm31 = mem ^ (zmm19 & (zmm31 ^ mem)) 6985; AVX512-NEXT: vpbroadcastq 88(%r8), %ymm1 6986; AVX512-NEXT: vpbroadcastq 96(%r8), %ymm2 6987; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 6988; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 6989; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm18 & (zmm1 ^ zmm31)) 6990; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm25 ^ (zmm19 & (zmm27 ^ zmm25)) 6991; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm2 6992; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm25 6993; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 6994; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm18 & (zmm2 ^ zmm27)) 6995; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] 6996; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 6997; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload 6998; AVX512-NEXT: # zmm31 = mem ^ (zmm18 & (zmm31 ^ mem)) 6999; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload 7000; AVX512-NEXT: # zmm24 = mem ^ (zmm18 & (zmm24 ^ mem)) 7001; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload 7002; AVX512-NEXT: # ymm18 = mem[0,1,0,1] 7003; AVX512-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload 7004; AVX512-NEXT: # ymm25 = mem[0,1,0,0] 7005; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm29[0,1,0,1] 7006; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,0] 7007; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] 7008; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm22[2,3,2,3] 7009; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] 7010; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 7011; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] 7012; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] 7013; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm21[2,3,2,2] 7014; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 7015; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,2,3,2] 7016; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] 7017; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5],ymm8[6],ymm4[7,8],ymm8[9],ymm4[10,11],ymm8[12],ymm4[13],ymm8[14],ymm4[15] 
7018; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 7019; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 7020; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 7021; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] 7022; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3 7023; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm4 7024; AVX512-NEXT: vpandnq 80(%r8){1to4}, %ymm16, %ymm29 7025; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 7026; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7027; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 7028; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 | (zmm31 & zmm21) 7029; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 | (zmm24 & zmm21) 7030; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm18, %zmm18 7031; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload 7032; AVX512-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] 7033; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 7034; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm18 ^ (zmm24 & (zmm21 ^ zmm18)) 7035; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm18 7036; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 7037; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 7038; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm18 & (zmm8 ^ zmm21)) 7039; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm21 7040; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload 7041; AVX512-NEXT: # zmm22 = mem[0,1,0,1,4,5,4,5] 7042; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm21 ^ (zmm24 & (zmm22 ^ zmm21)) 7043; AVX512-NEXT: vpbroadcastq (%r8), %ymm21 7044; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm21, %zmm10 7045; AVX512-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (zmm18 & (zmm10 ^ zmm22)) 7046; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 7047; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 7048; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm13 ^ (zmm24 & (zmm11 ^ zmm13)) 7049; AVX512-NEXT: vpbroadcastq 112(%r8), %ymm12 7050; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm13 7051; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 7052; AVX512-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm16 & (zmm12 ^ zmm11)) 7053; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 7054; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 7055; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm7 ^ (zmm24 & (zmm5 ^ zmm7)) 7056; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm6 7057; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm7 7058; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 7059; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm16 & (zmm6 ^ zmm5)) 7060; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm30 ^ (zmm19 & (zmm9 ^ zmm30)) 7061; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm19 & (zmm0 ^ zmm20)) 7062; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 7063; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm9 & zmm5) 7064; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm0 & zmm5) 7065; 
AVX512-NEXT: vmovdqa64 %zmm4, 384(%r9) 7066; AVX512-NEXT: vmovdqa64 %zmm17, 64(%r9) 7067; AVX512-NEXT: vmovdqa64 %zmm6, 256(%r9) 7068; AVX512-NEXT: vmovdqa64 %zmm12, 576(%r9) 7069; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) 7070; AVX512-NEXT: vmovdqa64 %zmm26, 192(%r9) 7071; AVX512-NEXT: vmovdqa64 %zmm2, 128(%r9) 7072; AVX512-NEXT: vmovdqa64 %zmm8, 320(%r9) 7073; AVX512-NEXT: vmovdqa64 %zmm1, 448(%r9) 7074; AVX512-NEXT: vmovdqa64 %zmm23, 512(%r9) 7075; AVX512-NEXT: addq $488, %rsp # imm = 0x1E8 7076; AVX512-NEXT: vzeroupper 7077; AVX512-NEXT: retq 7078; 7079; AVX512-FCP-LABEL: store_i16_stride5_vf64: 7080; AVX512-FCP: # %bb.0: 7081; AVX512-FCP-NEXT: subq $360, %rsp # imm = 0x168 7082; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm8 7083; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 7084; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm0 7085; AVX512-FCP-NEXT: vmovdqa64 96(%rdx), %ymm19 7086; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,0,3,0,7,4,7,4] 7087; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7088; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 7089; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7090; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 7091; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 7092; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 7093; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7094; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] 7095; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] 7096; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 7097; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 7098; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7099; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm3 7100; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 7101; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm1 7102; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 7103; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm23[1,1,2,2] 7104; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] 7105; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm4 7106; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7107; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 7108; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 7109; AVX512-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm5 7110; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 7111; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7112; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 7113; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7114; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm2 7115; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 7116; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm4 7117; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7118; 
AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] 7119; AVX512-FCP-NEXT: vpandn %ymm2, %ymm5, %ymm2 7120; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 7121; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 7122; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7123; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 7124; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 7125; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 7126; AVX512-FCP-NEXT: vmovdqa64 32(%rdx), %ymm29 7127; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[3,0,3,0,7,4,7,4] 7128; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] 7129; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 7130; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7131; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 7132; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 7133; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7134; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] 7135; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] 7136; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] 7137; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 7138; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7139; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 7140; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4 7141; AVX512-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm5 7142; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 7143; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 7144; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 7145; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 7146; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 7147; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 7148; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 7149; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm5 7150; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm6 7151; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] 7152; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm6 7153; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 7154; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 7155; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 7156; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 7157; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm5 7158; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm30 7159; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm30[1,1,2,2] 7160; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] 7161; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm6 7162; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7163; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 7164; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm6 7165; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7] 7166; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7167; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm16 7168; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm1 7169; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,1] 7170; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 7171; 
AVX512-FCP-NEXT: vpandn %ymm5, %ymm7, %ymm5 7172; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm17 7173; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 7174; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm5 7175; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 7176; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,2,2,2] 7177; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] 7178; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] 7179; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 7180; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7181; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] 7182; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 7183; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] 7184; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 7185; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm1 ^ (zmm31 & (zmm20 ^ zmm1)) 7186; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 7187; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 7188; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 7189; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,2,2,2] 7190; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] 7191; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 7192; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7193; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 7194; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm21 = zmm4[0,1,0,1,4,5,4,5] 7195; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm1 ^ (zmm31 & (zmm21 ^ zmm1)) 7196; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 7197; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7198; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 7199; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 7200; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm28[1,1,2,2] 7201; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7202; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,1,4,5,6,5] 7203; AVX512-FCP-NEXT: vprolq $16, %ymm2, %ymm2 7204; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 7205; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7206; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 7207; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7208; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm10 7209; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 7210; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm0 7211; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,0,3,0,7,4,7,4] 7212; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7213; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 7214; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] 7215; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm1 7216; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] 7217; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 7218; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7219; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 7220; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 7221; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm12, %ymm0 7222; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 7223; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm1 7224; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 7225; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 7226; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 7227; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 7228; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm0 7229; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,3,2,3,6,7,6,7] 7230; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 7231; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm1 7232; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,1,4,5,6,5] 7233; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] 7234; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,3,2,3,10,11,10,10] 7235; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 7236; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 7237; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,2,3,3,7,6,7,7] 7238; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] 7239; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm2 7240; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,1,1,2,5,5,5,6] 7241; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 7242; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] 7243; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 7244; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm31 & (zmm2 ^ zmm1)) 7245; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 7246; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm1 7247; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 7248; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm12 & (zmm14 ^ zmm2)) 7249; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm8 7250; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm0 7251; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 7252; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm23[1,1,2,2] 7253; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7254; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] 7255; AVX512-FCP-NEXT: vprolq $16, %ymm8, %ymm2 7256; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 7257; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7258; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 7259; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm1 7260; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm2 
7261; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm3 7262; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,0,3,0,7,4,7,4] 7263; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10],ymm2[11],ymm9[12,13],ymm2[14],ymm9[15] 7264; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 7265; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,1,2,5,5,5,6] 7266; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10,11],ymm13[12],ymm9[13],ymm13[14],ymm9[15] 7267; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 7268; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm9 7269; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm5 7270; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm13 7271; AVX512-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm12, %ymm24 7272; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm13, %zmm24 7273; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm13 7274; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[2,3,2,3,6,7,6,7] 7275; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 7276; AVX512-FCP-NEXT: vprolq $16, %ymm11, %ymm11 7277; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[0,1,2,1,4,5,6,5] 7278; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3],ymm15[4],ymm11[5,6],ymm15[7],ymm11[8,9],ymm15[10],ymm11[11],ymm15[12],ymm11[13,14],ymm15[15] 7279; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm11 7280; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 7281; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 7282; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm13 7283; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[3,2,3,3,7,6,7,7] 7284; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3,4],ymm13[5,6,7,8],ymm15[9],ymm13[10],ymm15[11,12],ymm13[13,14,15] 7285; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 7286; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] 7287; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] 7288; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm15 7289; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm11 ^ (zmm31 & (zmm15 ^ zmm11)) 7290; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm7 7291; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 7292; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 7293; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm12 & (zmm7 ^ zmm15)) 7294; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 7295; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload 7296; AVX512-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] 7297; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 7298; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,2,3,3,7,6,7,7] 7299; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] 7300; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 7301; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 7302; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,2,3,2,8,9,8,9] 7303; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 7304; AVX512-FCP-NEXT: vmovdqa 
96(%rdi), %xmm1 7305; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7306; AVX512-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 7307; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm15 7308; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 7309; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm23[2,3,2,3,6,7,6,7] 7310; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10],ymm12[11],ymm8[12,13],ymm12[14],ymm8[15] 7311; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 7312; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 7313; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] 7314; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 7315; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 7316; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm0 ^ (zmm1 & (zmm8 ^ zmm0)) 7317; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7318; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7319; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 7320; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 7321; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm4 7322; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,1] 7323; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,2,3,3,7,6,7,7] 7324; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] 7325; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 7326; AVX512-FCP-NEXT: vpbroadcastq 88(%r8), %ymm0 7327; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm5 7328; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 7329; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 7330; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm8)) 7331; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 7332; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 7333; AVX512-FCP-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] 7334; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 7335; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7336; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm6 7337; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm26[0,1,1,1] 7338; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] 7339; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] 7340; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 7341; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm1 & (zmm6 ^ zmm4)) 7342; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm4 7343; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 7344; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 7345; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm5 & (zmm4 ^ zmm6)) 7346; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] 7347; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 7348; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload 7349; AVX512-FCP-NEXT: # zmm3 = mem ^ (zmm5 & (zmm3 ^ mem)) 7350; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload 7351; AVX512-FCP-NEXT: # zmm16 = mem ^ (zmm5 & (zmm16 ^ mem)) 7352; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 7353; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7354; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm3 & zmm5) 7355; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm16 & zmm5) 7356; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm5 7357; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 7358; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 7359; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm20)) 7360; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 7361; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 7362; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm21)) 7363; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload 7364; AVX512-FCP-NEXT: # zmm25 = mem ^ (zmm1 & (zmm25 ^ mem)) 7365; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm19 ^ (zmm1 & (zmm9 ^ zmm19)) 7366; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 7367; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 | (zmm25 & zmm1) 7368; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm9 & zmm1) 7369; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) 7370; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9) 7371; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 448(%r9) 7372; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 384(%r9) 7373; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 576(%r9) 7374; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%r9) 7375; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) 7376; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%r9) 7377; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) 7378; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 512(%r9) 7379; AVX512-FCP-NEXT: addq $360, %rsp # imm = 0x168 7380; AVX512-FCP-NEXT: vzeroupper 7381; AVX512-FCP-NEXT: retq 7382; 7383; AVX512DQ-LABEL: store_i16_stride5_vf64: 7384; AVX512DQ: # %bb.0: 7385; AVX512DQ-NEXT: subq $488, %rsp # imm = 0x1E8 7386; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm11 7387; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 7388; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm0 7389; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm17 7390; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] 7391; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7392; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0 7393; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 7394; AVX512DQ-NEXT: vpshufb %xmm7, %xmm0, %xmm3 7395; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm2 7396; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2] 7397; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] 7398; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] 7399; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 7400; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7401; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm9 7402; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 7403; AVX512DQ-NEXT: vpshufb %ymm3, %ymm9, %ymm1 7404; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20 7405; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm4 7406; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,1,2,2] 7407; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] 7408; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm1 7409; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 7410; AVX512DQ-NEXT: vpshufb %xmm5, %xmm1, %xmm6 7411; AVX512DQ-NEXT: vpbroadcastq 104(%rdi), %xmm8 7412; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5],xmm8[6],xmm6[7] 7413; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 7414; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 7415; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7416; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm3 7417; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,zero,zero 7418; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7419; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] 7420; AVX512DQ-NEXT: vpandn %ymm3, %ymm15, %ymm3 7421; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 7422; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7423; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7424; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 7425; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0 7426; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm16 7427; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm2 7428; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7429; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] 7430; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm3 7431; AVX512DQ-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 7432; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 7433; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 7434; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] 7435; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] 7436; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] 7437; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7438; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm0 7439; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 7440; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 7441; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 7442; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm18 7443; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,3,2,3,6,7,6,7] 
7444; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm2 7445; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7446; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 7447; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] 7448; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 7449; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 7450; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[0,1,2,3],zmm0[0,1,0,1] 7451; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 7452; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm6 7453; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm1 7454; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7455; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7456; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] 7457; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 7458; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7459; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm0 7460; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm1 7461; AVX512DQ-NEXT: vpbroadcastq 72(%rdi), %xmm2 7462; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 7463; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 7464; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7465; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 7466; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 7467; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 7468; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 7469; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7470; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm1 7471; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm2 7472; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 7473; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 7474; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7475; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[3,2,3,3,7,6,7,7] 7476; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 7477; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 7478; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 7479; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7480; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 7481; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 7482; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 7483; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3,4],ymm0[5,6,7,8],ymm8[9],ymm0[10],ymm8[11,12],ymm0[13,14,15] 7484; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm8 7485; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] 7486; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] 7487; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1 7488; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[0,1,0,1] 7489; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm30 7490; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm10 7491; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 7492; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] 7493; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[2,3,2,3,6,7,6,7] 7494; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 7495; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] 7496; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm13 7497; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 7498; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] 7499; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 7500; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] 7501; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm0[0,1,2,3],zmm1[0,1,0,1] 7502; AVX512DQ-NEXT: vmovdqa64 32(%rdx), %ymm19 7503; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm1 7504; AVX512DQ-NEXT: vpshufb %ymm14, %ymm1, %ymm0 7505; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,0,3,0,7,4,7,4] 7506; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8],ymm0[9],ymm14[10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] 7507; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6 7508; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] 7509; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6],xmm6[7] 7510; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] 7511; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 7512; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7513; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm6 7514; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm0 7515; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm8 7516; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm8 7517; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm6[1,1,2,2] 7518; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15] 7519; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 7520; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm13 7521; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] 7522; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 7523; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm24 7524; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm5 7525; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,1,1,1] 7526; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 7527; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm5 7528; AVX512DQ-NEXT: vpandn %ymm8, %ymm15, %ymm8 7529; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm26 7530; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm3 7531; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7532; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 7533; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 7534; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] 7535; AVX512DQ-NEXT: vpshufb %xmm12, %xmm13, %xmm8 7536; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm29 7537; AVX512DQ-NEXT: vpshufb %xmm7, %xmm3, %xmm3 7538; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] 7539; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] 7540; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm28 7541; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] 7542; AVX512DQ-NEXT: vprolq $16, %ymm9, %ymm5 7543; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] 7544; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 7545; AVX512DQ-NEXT: 
vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 7546; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] 7547; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] 7548; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] 7549; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 7550; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] 7551; AVX512DQ-NEXT: vpshufb %ymm7, %ymm11, %ymm3 7552; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,2,5,5,5,6] 7553; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] 7554; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 7555; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] 7556; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,2,3,3,7,6,7,7] 7557; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2],ymm5[3,4],ymm3[5,6,7,8],ymm5[9],ymm3[10],ymm5[11,12],ymm3[13,14,15] 7558; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,2,1,4,5,6,5] 7559; AVX512DQ-NEXT: vprolq $16, %ymm0, %ymm5 7560; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] 7561; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] 7562; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] 7563; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,3,2,3,6,7,6,7] 7564; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] 7565; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21 7566; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm0 7567; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,1,1,2,5,5,5,6] 7568; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13],ymm3[14],ymm0[15] 7569; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] 7570; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] 7571; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,2,3,3,7,6,7,7] 7572; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] 7573; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm5 7574; AVX512DQ-NEXT: vpshufb %ymm5, %ymm10, %ymm0 7575; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm30[1,1,2,2] 7576; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7577; AVX512DQ-NEXT: vprolq $16, %ymm10, %ymm1 7578; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,1,4,5,6,5] 7579; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8,9],ymm3[10],ymm1[11],ymm3[12],ymm1[13,14],ymm3[15] 7580; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7581; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 7582; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 7583; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm0 7584; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,0,3,0,7,4,7,4] 7585; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7586; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm1 7587; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[1,1,1,2,5,5,5,6] 7588; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 7589; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7590; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm9 7591; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 7592; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm16, %ymm0 7593; AVX512DQ-NEXT: vmovdqa (%r8), %ymm10 7594; AVX512DQ-NEXT: vpshufb %ymm14, %ymm10, %ymm1 7595; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm23 7596; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 7597; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7598; AVX512DQ-NEXT: vpshufb %ymm5, %ymm2, %ymm0 7599; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm18[1,1,2,2] 7600; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7601; AVX512DQ-NEXT: vprolq $16, %ymm2, %ymm1 7602; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,1,4,5,6,5] 7603; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] 7604; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7605; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 7606; AVX512DQ-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload 7607; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm0 7608; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7609; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,0,3,0,7,4,7,4] 7610; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7611; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm4 7612; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 7613; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm31 # 64-byte Folded Reload 7614; AVX512DQ-NEXT: # zmm31 = mem ^ (zmm19 & (zmm31 ^ mem)) 7615; AVX512DQ-NEXT: vpbroadcastq 88(%r8), %ymm1 7616; AVX512DQ-NEXT: vpbroadcastq 96(%r8), %ymm2 7617; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 7618; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 7619; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm18 & (zmm1 ^ zmm31)) 7620; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm25 ^ (zmm19 & (zmm27 ^ zmm25)) 7621; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm2 7622; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm25 7623; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 7624; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm18 & (zmm2 ^ zmm27)) 7625; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] 7626; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 7627; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload 7628; AVX512DQ-NEXT: # zmm31 = mem ^ (zmm18 & (zmm31 ^ mem)) 7629; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload 7630; AVX512DQ-NEXT: # zmm24 = mem ^ (zmm18 & (zmm24 ^ mem)) 7631; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload 7632; AVX512DQ-NEXT: # ymm18 = mem[0,1,0,1] 7633; AVX512DQ-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload 7634; AVX512DQ-NEXT: # ymm25 = mem[0,1,0,0] 7635; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm29[0,1,0,1] 7636; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,0] 7637; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] 7638; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm22[2,3,2,3] 7639; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] 7640; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 7641; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] 7642; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] 7643; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm21[2,3,2,2] 7644; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 7645; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,2,3,2] 7646; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] 7647; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5],ymm8[6],ymm4[7,8],ymm8[9],ymm4[10,11],ymm8[12],ymm4[13],ymm8[14],ymm4[15] 7648; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 7649; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 7650; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm4 7651; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] 7652; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm3 7653; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm4 7654; AVX512DQ-NEXT: vpandnq 80(%r8){1to4}, %ymm16, %ymm29 7655; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 7656; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7657; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload 7658; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 | (zmm31 & zmm21) 7659; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 | (zmm24 & zmm21) 7660; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm18, %zmm18 7661; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload 7662; AVX512DQ-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] 7663; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 7664; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm18 ^ (zmm24 & (zmm21 ^ zmm18)) 7665; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm18 7666; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 7667; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 7668; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm18 & (zmm8 ^ zmm21)) 7669; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm21 7670; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload 7671; AVX512DQ-NEXT: # zmm22 = mem[0,1,0,1,4,5,4,5] 7672; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm21 ^ 
(zmm24 & (zmm22 ^ zmm21)) 7673; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm21 7674; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm21, %zmm10 7675; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (zmm18 & (zmm10 ^ zmm22)) 7676; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 7677; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 7678; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm13 ^ (zmm24 & (zmm11 ^ zmm13)) 7679; AVX512DQ-NEXT: vpbroadcastq 112(%r8), %ymm12 7680; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm13 7681; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 7682; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm16 & (zmm12 ^ zmm11)) 7683; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 7684; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 7685; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm7 ^ (zmm24 & (zmm5 ^ zmm7)) 7686; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm6 7687; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm7 7688; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 7689; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm16 & (zmm6 ^ zmm5)) 7690; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm30 ^ (zmm19 & (zmm9 ^ zmm30)) 7691; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm19 & (zmm0 ^ zmm20)) 7692; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 7693; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm9 & zmm5) 7694; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm0 & zmm5) 7695; AVX512DQ-NEXT: vmovdqa64 %zmm4, 384(%r9) 7696; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%r9) 7697; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%r9) 7698; AVX512DQ-NEXT: vmovdqa64 %zmm12, 576(%r9) 7699; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r9) 7700; AVX512DQ-NEXT: vmovdqa64 %zmm26, 192(%r9) 7701; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%r9) 7702; AVX512DQ-NEXT: vmovdqa64 %zmm8, 320(%r9) 7703; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%r9) 7704; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%r9) 7705; AVX512DQ-NEXT: addq $488, %rsp # imm = 0x1E8 7706; AVX512DQ-NEXT: vzeroupper 7707; AVX512DQ-NEXT: retq 7708; 7709; AVX512DQ-FCP-LABEL: store_i16_stride5_vf64: 7710; AVX512DQ-FCP: # %bb.0: 7711; AVX512DQ-FCP-NEXT: subq $360, %rsp # imm = 0x168 7712; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm8 7713; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 7714; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm0 7715; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdx), %ymm19 7716; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,0,3,0,7,4,7,4] 7717; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7718; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 7719; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7720; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] 7721; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 7722; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 7723; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7724; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] 7725; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] 7726; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] 7727; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 
7728; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7729; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm3 7730; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] 7731; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm1 7732; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 7733; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm23[1,1,2,2] 7734; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] 7735; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm4 7736; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7737; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] 7738; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 7739; AVX512DQ-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm5 7740; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 7741; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7742; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 7743; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7744; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm2 7745; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] 7746; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm4 7747; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7748; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] 7749; AVX512DQ-FCP-NEXT: vpandn %ymm2, %ymm5, %ymm2 7750; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 7751; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 7752; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7753; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 7754; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 7755; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 7756; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdx), %ymm29 7757; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[3,0,3,0,7,4,7,4] 7758; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] 7759; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 7760; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7761; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 7762; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 7763; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7764; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] 7765; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] 7766; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] 7767; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 7768; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7769; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 7770; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4 7771; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm5 7772; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] 7773; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 7774; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 7775; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 7776; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 7777; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 7778; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 7779; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm5 7780; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm6 7781; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] 7782; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm6 7783; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 7784; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 7785; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 7786; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 7787; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm5 7788; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm30 7789; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm30[1,1,2,2] 7790; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] 7791; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm6 7792; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7793; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 7794; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm6 7795; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7] 7796; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7797; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm16 7798; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm1 7799; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,1] 7800; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 7801; AVX512DQ-FCP-NEXT: vpandn %ymm5, %ymm7, %ymm5 7802; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm17 7803; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 7804; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm5 7805; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 7806; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,2,2,2] 7807; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] 7808; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] 7809; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 7810; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7811; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] 7812; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 7813; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] 7814; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] 7815; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm1 ^ (zmm31 & (zmm20 ^ zmm1)) 7816; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 7817; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 7818; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 7819; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,2,2,2] 7820; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] 7821; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 7822; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7823; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 7824; 
AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm21 = zmm4[0,1,0,1,4,5,4,5] 7825; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm1 ^ (zmm31 & (zmm21 ^ zmm1)) 7826; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 7827; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7828; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 7829; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 7830; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm28[1,1,2,2] 7831; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7832; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,1,4,5,6,5] 7833; AVX512DQ-FCP-NEXT: vprolq $16, %ymm2, %ymm2 7834; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 7835; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7836; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 7837; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7838; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm10 7839; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 7840; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm0 7841; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,0,3,0,7,4,7,4] 7842; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 7843; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] 7844; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] 7845; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm1 7846; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] 7847; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 7848; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7849; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 7850; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] 7851; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm12, %ymm0 7852; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 7853; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm1 7854; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 7855; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 7856; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] 7857; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 7858; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm0 7859; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,3,2,3,6,7,6,7] 7860; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 7861; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm1 7862; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,1,4,5,6,5] 7863; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] 7864; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,3,2,3,10,11,10,10] 7865; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 7866; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 7867; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,2,3,3,7,6,7,7] 7868; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] 7869; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm2 7870; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,1,1,2,5,5,5,6] 7871; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 7872; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] 7873; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 7874; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm31 & (zmm2 ^ zmm1)) 7875; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 7876; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm1 7877; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 7878; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm12 & (zmm14 ^ zmm2)) 7879; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm8 7880; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm0 7881; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 7882; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm23[1,1,2,2] 7883; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7884; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] 7885; AVX512DQ-FCP-NEXT: vprolq $16, %ymm8, %ymm2 7886; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 7887; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7888; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 7889; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm1 7890; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm2 7891; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm3 7892; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,0,3,0,7,4,7,4] 7893; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10],ymm2[11],ymm9[12,13],ymm2[14],ymm9[15] 7894; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 7895; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,1,2,5,5,5,6] 7896; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10,11],ymm13[12],ymm9[13],ymm13[14],ymm9[15] 7897; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 7898; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm9 7899; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm5 7900; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm13 7901; AVX512DQ-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm12, %ymm24 7902; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm13, %zmm24 7903; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm13 7904; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[2,3,2,3,6,7,6,7] 7905; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 7906; AVX512DQ-FCP-NEXT: vprolq $16, %ymm11, %ymm11 7907; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[0,1,2,1,4,5,6,5] 7908; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3],ymm15[4],ymm11[5,6],ymm15[7],ymm11[8,9],ymm15[10],ymm11[11],ymm15[12],ymm11[13,14],ymm15[15] 7909; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm11 
7910; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 7911; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] 7912; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm13 7913; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[3,2,3,3,7,6,7,7] 7914; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3,4],ymm13[5,6,7,8],ymm15[9],ymm13[10],ymm15[11,12],ymm13[13,14,15] 7915; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 7916; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] 7917; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] 7918; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm15 7919; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm11 ^ (zmm31 & (zmm15 ^ zmm11)) 7920; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm7 7921; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 7922; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 7923; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm12 & (zmm7 ^ zmm15)) 7924; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 7925; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload 7926; AVX512DQ-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] 7927; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 7928; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,2,3,3,7,6,7,7] 7929; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] 7930; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] 7931; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 7932; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,2,3,2,8,9,8,9] 7933; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 7934; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 7935; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7936; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 7937; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm15 7938; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 7939; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm23[2,3,2,3,6,7,6,7] 7940; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10],ymm12[11],ymm8[12,13],ymm12[14],ymm8[15] 7941; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] 7942; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 7943; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] 7944; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 7945; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] 7946; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm0 ^ (zmm1 & (zmm8 ^ zmm0)) 7947; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7948; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 7949; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 7950; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 7951; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm4 7952; AVX512DQ-FCP-NEXT: vpermq 
{{.*#+}} ymm2 = ymm5[0,1,1,1] 7953; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,2,3,3,7,6,7,7] 7954; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] 7955; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 7956; AVX512DQ-FCP-NEXT: vpbroadcastq 88(%r8), %ymm0 7957; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm5 7958; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 7959; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] 7960; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm8)) 7961; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 7962; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 7963; AVX512DQ-FCP-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] 7964; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 7965; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7966; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm6 7967; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm26[0,1,1,1] 7968; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] 7969; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] 7970; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 7971; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm1 & (zmm6 ^ zmm4)) 7972; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm4 7973; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 7974; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 7975; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm5 & (zmm4 ^ zmm6)) 7976; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] 7977; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 7978; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload 7979; AVX512DQ-FCP-NEXT: # zmm3 = mem ^ (zmm5 & (zmm3 ^ mem)) 7980; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload 7981; AVX512DQ-FCP-NEXT: # zmm16 = mem ^ (zmm5 & (zmm16 ^ mem)) 7982; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 7983; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] 7984; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm3 & zmm5) 7985; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm16 & zmm5) 7986; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm5 7987; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 7988; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] 7989; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm20)) 7990; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 7991; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 7992; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ 
(zmm5 & (zmm6 ^ zmm21)) 7993; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload 7994; AVX512DQ-FCP-NEXT: # zmm25 = mem ^ (zmm1 & (zmm25 ^ mem)) 7995; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm19 ^ (zmm1 & (zmm9 ^ zmm19)) 7996; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] 7997; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 | (zmm25 & zmm1) 7998; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm9 & zmm1) 7999; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) 8000; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9) 8001; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 448(%r9) 8002; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 384(%r9) 8003; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 576(%r9) 8004; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%r9) 8005; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) 8006; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%r9) 8007; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) 8008; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 512(%r9) 8009; AVX512DQ-FCP-NEXT: addq $360, %rsp # imm = 0x168 8010; AVX512DQ-FCP-NEXT: vzeroupper 8011; AVX512DQ-FCP-NEXT: retq 8012; 8013; AVX512BW-LABEL: store_i16_stride5_vf64: 8014; AVX512BW: # %bb.0: 8015; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 8016; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 8017; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 8018; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 8019; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 8020; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 8021; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 8022; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 8023; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 8024; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 8025; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 8026; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 8027; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 8028; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 8029; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 8030; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 8031; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C 8032; AVX512BW-NEXT: kmovd %eax, %k1 8033; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} 8034; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 8035; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 8036; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 8037; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 8038; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 8039; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 8040; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 8041; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 8042; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 8043; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 8044; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 8045; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 8046; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 8047; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 8048; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 8049; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 8050; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 8051; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 8052; AVX512BW-NEXT: kmovd %eax, %k3 8053; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} 8054; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 8055; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 8056; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 8057; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 8058; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 8059; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 8060; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 8061; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 8062; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 8063; AVX512BW-NEXT: kmovd %eax, %k2 8064; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} 8065; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 8066; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 8067; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 8068; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 8069; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 8070; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 8071; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 8072; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 8073; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} 8074; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 8075; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 8076; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 8077; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 8078; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} 8079; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 8080; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 8081; AVX512BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 8082; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} 8083; AVX512BW-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 8084; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 8085; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 8086; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} 8087; AVX512BW-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 8088; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 8089; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 8090; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} 8091; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 8092; AVX512BW-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 8093; AVX512BW-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 8094; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} 8095; AVX512BW-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 8096; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) 8097; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) 8098; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) 8099; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) 8100; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) 8101; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) 8102; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) 8103; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) 8104; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) 8105; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) 8106; AVX512BW-NEXT: vzeroupper 8107; AVX512BW-NEXT: retq 
8108; 8109; AVX512BW-FCP-LABEL: store_i16_stride5_vf64: 8110; AVX512BW-FCP: # %bb.0: 8111; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 8112; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 8113; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 8114; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 8115; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 8116; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 8117; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 8118; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 8119; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 8120; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 8121; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 8122; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 8123; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 8124; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 8125; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 8126; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 8127; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C 8128; AVX512BW-FCP-NEXT: kmovd %eax, %k1 8129; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} 8130; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 8131; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 8132; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 8133; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 8134; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 8135; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 8136; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 8137; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 8138; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 8139; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 8140; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 8141; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 8142; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 8143; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 8144; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 8145; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 8146; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 8147; AVX512BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 8148; AVX512BW-FCP-NEXT: kmovd %eax, %k3 8149; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} 8150; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 8151; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 8152; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 8153; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 8154; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 8155; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 8156; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 8157; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 8158; AVX512BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 8159; AVX512BW-FCP-NEXT: kmovd %eax, %k2 8160; 
AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} 8161; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 8162; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 8163; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 8164; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 8165; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 8166; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 8167; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 8168; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 8169; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} 8170; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 8171; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 8172; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 8173; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 8174; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} 8175; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 8176; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 8177; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 8178; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} 8179; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 8180; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 8181; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 8182; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} 8183; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 8184; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 8185; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 8186; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} 8187; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 8188; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 8189; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 8190; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} 8191; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 8192; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%r9) 8193; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) 8194; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) 8195; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) 8196; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) 8197; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%r9) 8198; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%r9) 8199; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 512(%r9) 8200; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 576(%r9) 8201; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 8202; AVX512BW-FCP-NEXT: vzeroupper 8203; AVX512BW-FCP-NEXT: retq 8204; 8205; AVX512DQ-BW-LABEL: store_i16_stride5_vf64: 8206; AVX512DQ-BW: # %bb.0: 8207; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 8208; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 8209; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm4 8210; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm11 8211; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 8212; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12 8213; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 8214; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 8215; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 8216; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 8217; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 8218; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 8219; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 8220; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = 
[0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 8221; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 8222; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 8223; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C 8224; AVX512DQ-BW-NEXT: kmovd %eax, %k1 8225; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} 8226; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 8227; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 8228; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 8229; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 8230; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 8231; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 8232; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 8233; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 8234; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 8235; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 8236; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 8237; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 8238; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 8239; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 8240; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 8241; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 8242; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 8243; AVX512DQ-BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 8244; AVX512DQ-BW-NEXT: kmovd %eax, %k3 8245; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} 8246; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 8247; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 8248; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 8249; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm24 8250; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 8251; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 8252; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 8253; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 8254; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 8255; AVX512DQ-BW-NEXT: kmovd %eax, %k2 8256; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} 8257; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 8258; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 8259; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 8260; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 8261; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 8262; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 8263; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm29 8264; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 8265; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} 8266; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 8267; 
AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 8268; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 8269; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 8270; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} 8271; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 8272; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 8273; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 8274; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} 8275; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 8276; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 8277; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 8278; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} 8279; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 8280; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 8281; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 8282; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} 8283; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 8284; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 8285; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 8286; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} 8287; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 8288; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%r9) 8289; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%r9) 8290; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%r9) 8291; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%r9) 8292; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%r9) 8293; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 384(%r9) 8294; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%r9) 8295; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 512(%r9) 8296; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 576(%r9) 8297; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) 8298; AVX512DQ-BW-NEXT: vzeroupper 8299; AVX512DQ-BW-NEXT: retq 8300; 8301; AVX512DQ-BW-FCP-LABEL: store_i16_stride5_vf64: 8302; AVX512DQ-BW-FCP: # %bb.0: 8303; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 8304; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 8305; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 8306; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 8307; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 8308; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 8309; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 8310; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 8311; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 8312; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 8313; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] 8314; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 8315; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 8316; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] 8317; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 8318; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 8319; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C 8320; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 8321; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} 8322; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] 8323; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 8324; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] 8325; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 8326; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 8327; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] 8328; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 8329; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 8330; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 8331; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] 8332; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 8333; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] 8334; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 8335; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 8336; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] 8337; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 8338; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 8339; AVX512DQ-BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 8340; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 8341; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} 8342; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] 8343; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 8344; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] 8345; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 8346; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 8347; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] 8348; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 8349; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 8350; AVX512DQ-BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 8351; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 8352; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} 8353; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] 8354; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 8355; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] 8356; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 8357; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 8358; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] 8359; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 8360; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 8361; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} 8362; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] 8363; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 8364; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 8365; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 8366; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} 8367; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 8368; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 8369; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 8370; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} 8371; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 8372; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 8373; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 8374; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} 8375; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 
8376; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 8377; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 8378; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} 8379; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 8380; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 8381; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 8382; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} 8383; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 8384; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%r9) 8385; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) 8386; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) 8387; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) 8388; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) 8389; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%r9) 8390; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%r9) 8391; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 512(%r9) 8392; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 576(%r9) 8393; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 8394; AVX512DQ-BW-FCP-NEXT: vzeroupper 8395; AVX512DQ-BW-FCP-NEXT: retq 8396 %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 8397 %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64 8398 %in.vec2 = load <64 x i16>, ptr %in.vecptr2, align 64 8399 %in.vec3 = load <64 x i16>, ptr %in.vecptr3, align 64 8400 %in.vec4 = load <64 x i16>, ptr %in.vecptr4, align 64 8401 %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 8402 %2 = shufflevector <64 x i16> %in.vec2, <64 x i16> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 
121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 8403 %3 = shufflevector <128 x i16> %1, <128 x i16> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 8404 %4 = shufflevector <64 x i16> %in.vec4, <64 x i16> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 8405 %5 = shufflevector <256 x i16> %3, <256 x i16> %4, <320 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 
237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319> 8406 %interleaved.vec = shufflevector <320 x i16> %5, <320 x i16> poison, <320 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 1, i32 65, i32 129, i32 193, i32 257, i32 2, i32 66, i32 130, i32 194, i32 258, i32 3, i32 67, i32 131, i32 195, i32 259, i32 4, i32 68, i32 132, i32 196, i32 260, i32 5, i32 69, i32 133, i32 197, i32 261, i32 6, i32 70, i32 134, i32 198, i32 262, i32 7, i32 71, i32 135, i32 199, i32 263, i32 8, i32 72, i32 136, i32 200, i32 264, i32 9, i32 73, i32 137, i32 201, i32 265, i32 10, i32 74, i32 138, i32 202, i32 266, i32 11, i32 75, i32 139, i32 203, i32 267, i32 12, i32 76, i32 140, i32 204, i32 268, i32 13, i32 77, i32 141, i32 205, i32 269, i32 14, i32 78, i32 142, i32 206, i32 270, i32 15, i32 79, i32 143, i32 207, i32 271, i32 16, i32 80, i32 144, i32 208, i32 272, i32 17, i32 81, i32 145, i32 209, i32 273, i32 18, i32 82, i32 146, i32 210, i32 274, i32 19, i32 83, i32 147, i32 211, i32 275, i32 20, i32 84, i32 148, i32 212, i32 276, i32 21, i32 85, i32 149, i32 213, i32 277, i32 22, i32 86, i32 150, i32 214, i32 278, i32 23, i32 87, i32 151, i32 215, i32 279, i32 24, i32 88, i32 152, i32 216, i32 280, i32 25, i32 89, i32 153, i32 217, i32 281, i32 26, i32 90, i32 154, i32 218, i32 282, i32 27, i32 91, i32 155, i32 219, i32 283, i32 28, i32 92, i32 156, i32 220, i32 284, i32 29, i32 93, i32 157, i32 221, i32 285, i32 30, i32 94, i32 158, i32 222, i32 286, i32 31, i32 95, i32 159, i32 223, i32 287, i32 32, i32 96, i32 160, i32 224, i32 288, i32 33, i32 97, i32 161, i32 225, i32 289, i32 34, i32 98, i32 162, i32 226, i32 290, i32 35, i32 99, i32 163, i32 227, i32 291, i32 36, i32 100, i32 164, i32 228, i32 292, i32 37, i32 101, i32 165, i32 229, i32 293, i32 38, i32 102, i32 166, i32 230, i32 294, i32 39, i32 103, i32 167, i32 231, i32 295, i32 40, i32 104, i32 168, i32 232, i32 296, i32 41, i32 105, i32 169, i32 233, i32 297, i32 42, i32 106, i32 170, i32 234, i32 298, i32 43, i32 107, i32 171, i32 235, i32 299, i32 44, i32 108, i32 172, i32 236, i32 300, i32 45, i32 109, i32 173, i32 237, i32 301, i32 46, i32 110, i32 174, i32 238, i32 302, i32 47, i32 111, i32 175, i32 239, i32 303, i32 48, i32 112, i32 176, i32 240, i32 304, i32 49, i32 113, i32 177, i32 241, i32 305, i32 50, i32 114, i32 178, i32 242, i32 306, i32 51, i32 115, i32 179, i32 243, i32 307, i32 52, i32 116, i32 180, i32 244, i32 308, i32 53, i32 117, i32 181, i32 245, i32 309, i32 54, i32 118, i32 182, i32 246, i32 310, i32 55, i32 119, i32 183, i32 247, i32 311, i32 56, i32 120, i32 184, i32 248, i32 312, i32 57, i32 121, i32 185, i32 249, i32 313, i32 58, i32 122, i32 186, i32 250, i32 314, i32 59, i32 123, i32 187, i32 251, i32 315, i32 60, i32 124, i32 188, i32 252, i32 316, i32 61, i32 125, i32 189, i32 253, i32 317, i32 62, i32 126, i32 190, i32 254, i32 
318, i32 63, i32 127, i32 191, i32 255, i32 319>
  store <320 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
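; The vf64 body above concatenates the five <64 x i16> inputs into one
; <320 x i16> value and applies the stride-5 interleave mask
; <0, 64, 128, 192, 256, 1, 65, 129, 193, 257, ...>, i.e. element 5*k+j of
; %interleaved.vec is element k of %in.vecj (j = 0..4). As an illustrative
; sketch only (not one of the checked functions; the name and value names are
; made up), the same pattern at width 2 reduces to:
;
;   define void @interleave5_vf2_sketch(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c,
;                                       <2 x i16> %d, <2 x i16> %e, ptr %out) nounwind {
;     %ab   = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;     %cd   = shufflevector <2 x i16> %c, <2 x i16> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;     %abcd = shufflevector <4 x i16> %ab, <4 x i16> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;     %e8   = shufflevector <2 x i16> %e, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
;     ; pick a0,b0,c0,d0,e0 then a1,b1,c1,d1,e1 from the concatenation
;     %all  = shufflevector <8 x i16> %abcd, <8 x i16> %e8, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
;     store <10 x i16> %all, ptr %out, align 64
;     ret void
;   }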