; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
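;
; Each function below loads a <vf*5 x i8> vector and splits it into 5 strided
; subvectors (elements i, i+5, i+10, ...). A minimal sketch of that IR shape,
; using illustrative names (%wide, %sub0, %in, %out0) rather than the checked
; functions that follow:
;
;   %wide = load <10 x i8>, ptr %in, align 64
;   %sub0 = shufflevector <10 x i8> %wide, <10 x i8> poison, <2 x i32> <i32 0, i32 5>
;   store <2 x i8> %sub0, ptr %out0, align 64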

define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i8_stride5_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm3, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm4, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm5, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; SSE-NEXT: packuswb %xmm6, %xmm6
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: movw %ax, (%rsi)
; SSE-NEXT: movd %xmm4, %eax
; SSE-NEXT: movw %ax, (%rdx)
; SSE-NEXT: movd %xmm5, %eax
; SSE-NEXT: movw %ax, (%rcx)
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: movw %ax, (%r8)
; SSE-NEXT: movd %xmm6, %eax
; SSE-NEXT: movw %ax, (%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i8_stride5_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i8_stride5_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX2-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX2-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i8_stride5_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 91; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 92; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi) 93; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx) 94; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx) 95; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8) 96; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%r9) 97; AVX2-FP-NEXT: retq 98; 99; AVX2-FCP-LABEL: load_i8_stride5_vf2: 100; AVX2-FCP: # %bb.0: 101; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 102; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 103; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 104; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 105; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 106; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 107; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 108; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 109; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 110; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 111; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%r9) 112; AVX2-FCP-NEXT: retq 113; 114; AVX512-LABEL: load_i8_stride5_vf2: 115; AVX512: # %bb.0: 116; AVX512-NEXT: vmovdqa (%rdi), %xmm0 117; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 118; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 119; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 120; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 121; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 122; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi) 123; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx) 124; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx) 125; AVX512-NEXT: vpextrw $0, %xmm4, (%r8) 126; AVX512-NEXT: vpextrw $0, %xmm0, (%r9) 127; AVX512-NEXT: retq 128; 129; AVX512-FCP-LABEL: load_i8_stride5_vf2: 130; AVX512-FCP: # %bb.0: 131; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 132; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 133; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 134; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 135; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 136; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 137; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 138; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 139; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 140; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 141; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%r9) 142; AVX512-FCP-NEXT: retq 143; 144; AVX512DQ-LABEL: load_i8_stride5_vf2: 145; AVX512DQ: # %bb.0: 146; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 147; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 148; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 149; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 150; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 151; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 152; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi) 153; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx) 154; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx) 155; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8) 156; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%r9) 157; AVX512DQ-NEXT: retq 158; 159; AVX512DQ-FCP-LABEL: load_i8_stride5_vf2: 160; 
AVX512DQ-FCP: # %bb.0: 161; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 162; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 163; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 164; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 165; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 166; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 167; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 168; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 169; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 170; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 171; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%r9) 172; AVX512DQ-FCP-NEXT: retq 173; 174; AVX512BW-LABEL: load_i8_stride5_vf2: 175; AVX512BW: # %bb.0: 176; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 177; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 178; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 179; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 180; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 181; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 182; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi) 183; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx) 184; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx) 185; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8) 186; AVX512BW-NEXT: vpextrw $0, %xmm0, (%r9) 187; AVX512BW-NEXT: retq 188; 189; AVX512BW-FCP-LABEL: load_i8_stride5_vf2: 190; AVX512BW-FCP: # %bb.0: 191; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 192; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 193; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 194; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 195; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 196; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 197; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 198; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 199; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 200; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 201; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%r9) 202; AVX512BW-FCP-NEXT: retq 203; 204; AVX512DQ-BW-LABEL: load_i8_stride5_vf2: 205; AVX512DQ-BW: # %bb.0: 206; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 207; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 208; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 209; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 210; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 211; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 212; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi) 213; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx) 214; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx) 215; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8) 216; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%r9) 217; AVX512DQ-BW-NEXT: retq 218; 219; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf2: 220; AVX512DQ-BW-FCP: # %bb.0: 221; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 222; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 223; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 224; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <10 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 0, i32 5>
  %strided.vec1 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 1, i32 6>
  %strided.vec2 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 2, i32 7>
  %strided.vec3 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 3, i32 8>
  %strided.vec4 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 4, i32 9>
  store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i8_stride5_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm5
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE-NEXT: packuswb %xmm1, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm5, %xmm7
; SSE-NEXT: pand %xmm6, %xmm7
; SSE-NEXT: pandn %xmm0, %xmm6
; SSE-NEXT: por %xmm7, %xmm6
; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7]
; SSE-NEXT: packuswb %xmm6, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm5, %xmm8
; SSE-NEXT: pand %xmm7, %xmm8
; SSE-NEXT: pandn %xmm0, %xmm7
; SSE-NEXT: por %xmm8, %xmm7
; SSE-NEXT: punpcklbw {{.*#+}} xmm7 =
xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 283; SSE-NEXT: movdqa %xmm2, %xmm8 284; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] 285; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] 286; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] 287; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 288; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 289; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] 290; SSE-NEXT: packuswb %xmm7, %xmm7 291; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] 292; SSE-NEXT: pand %xmm8, %xmm5 293; SSE-NEXT: pandn %xmm0, %xmm8 294; SSE-NEXT: por %xmm5, %xmm8 295; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] 296; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[2,0] 297; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,7] 298; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 299; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,0,1,2,4,5,6,7] 300; SSE-NEXT: packuswb %xmm4, %xmm4 301; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] 302; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] 303; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 304; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 305; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 306; SSE-NEXT: packuswb %xmm2, %xmm2 307; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] 308; SSE-NEXT: pand %xmm3, %xmm2 309; SSE-NEXT: pandn %xmm0, %xmm3 310; SSE-NEXT: por %xmm2, %xmm3 311; SSE-NEXT: movd %xmm1, (%rsi) 312; SSE-NEXT: movd %xmm6, (%rdx) 313; SSE-NEXT: movd %xmm7, (%rcx) 314; SSE-NEXT: movd %xmm4, (%r8) 315; SSE-NEXT: movd %xmm3, (%r9) 316; SSE-NEXT: retq 317; 318; AVX-LABEL: load_i8_stride5_vf4: 319; AVX: # %bb.0: 320; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 321; AVX-NEXT: vmovdqa (%rdi), %xmm1 322; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 323; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3 324; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 325; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 326; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 327; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 328; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 329; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 330; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 331; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 332; AVX-NEXT: vmovd %xmm3, (%rsi) 333; AVX-NEXT: vmovd %xmm4, (%rdx) 334; AVX-NEXT: vmovd %xmm5, (%rcx) 335; AVX-NEXT: vmovd %xmm6, (%r8) 336; AVX-NEXT: vmovd %xmm0, (%r9) 337; AVX-NEXT: retq 338; 339; AVX2-LABEL: load_i8_stride5_vf4: 340; AVX2: # %bb.0: 341; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 342; AVX2-NEXT: vmovdqa (%rdi), %xmm1 343; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 344; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 345; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 346; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4 347; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 348; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 349; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 350; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 351; 
AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 352; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 353; AVX2-NEXT: vmovd %xmm3, (%rsi) 354; AVX2-NEXT: vmovd %xmm4, (%rdx) 355; AVX2-NEXT: vmovd %xmm5, (%rcx) 356; AVX2-NEXT: vmovd %xmm6, (%r8) 357; AVX2-NEXT: vmovd %xmm0, (%r9) 358; AVX2-NEXT: retq 359; 360; AVX2-FP-LABEL: load_i8_stride5_vf4: 361; AVX2-FP: # %bb.0: 362; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 363; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 364; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2 365; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 366; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 367; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 368; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 369; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 370; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 371; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 372; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 373; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 374; AVX2-FP-NEXT: vmovd %xmm3, (%rsi) 375; AVX2-FP-NEXT: vmovd %xmm4, (%rdx) 376; AVX2-FP-NEXT: vmovd %xmm5, (%rcx) 377; AVX2-FP-NEXT: vmovd %xmm6, (%r8) 378; AVX2-FP-NEXT: vmovd %xmm0, (%r9) 379; AVX2-FP-NEXT: retq 380; 381; AVX2-FCP-LABEL: load_i8_stride5_vf4: 382; AVX2-FCP: # %bb.0: 383; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 384; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 385; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 386; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 387; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 388; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 389; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 390; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 391; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 392; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 393; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 394; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 395; AVX2-FCP-NEXT: vmovd %xmm3, (%rsi) 396; AVX2-FCP-NEXT: vmovd %xmm4, (%rdx) 397; AVX2-FCP-NEXT: vmovd %xmm5, (%rcx) 398; AVX2-FCP-NEXT: vmovd %xmm6, (%r8) 399; AVX2-FCP-NEXT: vmovd %xmm0, (%r9) 400; AVX2-FCP-NEXT: retq 401; 402; AVX512-LABEL: load_i8_stride5_vf4: 403; AVX512: # %bb.0: 404; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 405; AVX512-NEXT: vmovdqa (%rdi), %xmm1 406; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 407; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm3 408; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 409; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4 410; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 411; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5 412; AVX512-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 413; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm6 414; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 415; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 416; AVX512-NEXT: vmovd %xmm3, (%rsi) 417; AVX512-NEXT: vmovd %xmm4, (%rdx) 418; AVX512-NEXT: vmovd %xmm5, (%rcx) 419; AVX512-NEXT: vmovd %xmm6, (%r8) 420; AVX512-NEXT: vmovd %xmm0, (%r9) 421; AVX512-NEXT: retq 422; 423; AVX512-FCP-LABEL: load_i8_stride5_vf4: 
424; AVX512-FCP: # %bb.0: 425; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 426; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 427; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 428; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 429; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 430; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 431; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 432; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 433; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 434; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 435; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 436; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 437; AVX512-FCP-NEXT: vmovd %xmm3, (%rsi) 438; AVX512-FCP-NEXT: vmovd %xmm4, (%rdx) 439; AVX512-FCP-NEXT: vmovd %xmm5, (%rcx) 440; AVX512-FCP-NEXT: vmovd %xmm6, (%r8) 441; AVX512-FCP-NEXT: vmovd %xmm0, (%r9) 442; AVX512-FCP-NEXT: retq 443; 444; AVX512DQ-LABEL: load_i8_stride5_vf4: 445; AVX512DQ: # %bb.0: 446; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 447; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 448; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 449; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm3 450; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 451; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4 452; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 453; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5 454; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 455; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6 456; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 457; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 458; AVX512DQ-NEXT: vmovd %xmm3, (%rsi) 459; AVX512DQ-NEXT: vmovd %xmm4, (%rdx) 460; AVX512DQ-NEXT: vmovd %xmm5, (%rcx) 461; AVX512DQ-NEXT: vmovd %xmm6, (%r8) 462; AVX512DQ-NEXT: vmovd %xmm0, (%r9) 463; AVX512DQ-NEXT: retq 464; 465; AVX512DQ-FCP-LABEL: load_i8_stride5_vf4: 466; AVX512DQ-FCP: # %bb.0: 467; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 468; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 469; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 470; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 471; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 472; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 473; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 474; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 475; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 476; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 477; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 478; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 479; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rsi) 480; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rdx) 481; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%rcx) 482; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r8) 483; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r9) 484; AVX512DQ-FCP-NEXT: retq 485; 486; AVX512BW-LABEL: load_i8_stride5_vf4: 487; AVX512BW: # %bb.0: 488; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 489; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 490; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 491; 
AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm3 492; AVX512BW-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 493; AVX512BW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 494; AVX512BW-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 495; AVX512BW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 496; AVX512BW-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 497; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 498; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 499; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 500; AVX512BW-NEXT: vmovd %xmm3, (%rsi) 501; AVX512BW-NEXT: vmovd %xmm4, (%rdx) 502; AVX512BW-NEXT: vmovd %xmm5, (%rcx) 503; AVX512BW-NEXT: vmovd %xmm6, (%r8) 504; AVX512BW-NEXT: vmovd %xmm0, (%r9) 505; AVX512BW-NEXT: retq 506; 507; AVX512BW-FCP-LABEL: load_i8_stride5_vf4: 508; AVX512BW-FCP: # %bb.0: 509; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 510; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 511; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 512; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 513; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 514; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 515; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 516; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 517; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 518; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 519; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 520; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 521; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rsi) 522; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rdx) 523; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx) 524; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r8) 525; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r9) 526; AVX512BW-FCP-NEXT: retq 527; 528; AVX512DQ-BW-LABEL: load_i8_stride5_vf4: 529; AVX512DQ-BW: # %bb.0: 530; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 531; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 532; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 533; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm3 534; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] 535; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 536; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] 537; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 538; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] 539; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 540; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] 541; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 542; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rsi) 543; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rdx) 544; AVX512DQ-BW-NEXT: vmovd %xmm5, (%rcx) 545; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r8) 546; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r9) 547; AVX512DQ-BW-NEXT: retq 548; 549; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf4: 550; AVX512DQ-BW-FCP: # %bb.0: 551; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] 552; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 553; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 554; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 555; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = 
xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <20 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
  %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
  %strided.vec2 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
  %strided.vec3 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
  %strided.vec4 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
  store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i8_stride5_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm4
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pandn %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: pand %xmm1, %xmm5
; SSE-NEXT: por %xmm2, %xmm5
; SSE-NEXT: pxor %xmm6, %xmm6
; SSE-NEXT: movdqa %xmm5, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
; SSE-NEXT: pand %xmm7, %xmm5
; SSE-NEXT: pandn %xmm2, %xmm7
; SSE-NEXT: por %xmm5, %xmm7
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7]
; SSE-NEXT: packuswb %xmm7, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: pandn %xmm8, %xmm5
; SSE-NEXT: por %xmm7,
%xmm5 615; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 616; SSE-NEXT: movdqa %xmm3, %xmm8 617; SSE-NEXT: pand %xmm7, %xmm8 618; SSE-NEXT: pandn %xmm4, %xmm7 619; SSE-NEXT: por %xmm8, %xmm7 620; SSE-NEXT: movdqa %xmm7, %xmm8 621; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] 622; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0] 623; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 624; SSE-NEXT: pand %xmm9, %xmm7 625; SSE-NEXT: pandn %xmm8, %xmm9 626; SSE-NEXT: por %xmm7, %xmm9 627; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3] 628; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] 629; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] 630; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] 631; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7] 632; SSE-NEXT: packuswb %xmm7, %xmm7 633; SSE-NEXT: pand %xmm2, %xmm7 634; SSE-NEXT: movdqa %xmm0, %xmm8 635; SSE-NEXT: pslld $24, %xmm8 636; SSE-NEXT: pandn %xmm8, %xmm2 637; SSE-NEXT: por %xmm7, %xmm2 638; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 639; SSE-NEXT: movdqa %xmm3, %xmm8 640; SSE-NEXT: pand %xmm7, %xmm8 641; SSE-NEXT: pandn %xmm4, %xmm7 642; SSE-NEXT: por %xmm8, %xmm7 643; SSE-NEXT: movdqa %xmm7, %xmm9 644; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] 645; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] 646; SSE-NEXT: movdqa %xmm8, %xmm10 647; SSE-NEXT: pandn %xmm9, %xmm10 648; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 649; SSE-NEXT: pand %xmm8, %xmm7 650; SSE-NEXT: por %xmm10, %xmm7 651; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] 652; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 653; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 654; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] 655; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7] 656; SSE-NEXT: packuswb %xmm10, %xmm10 657; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] 658; SSE-NEXT: pand %xmm7, %xmm10 659; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 660; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0] 661; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5] 662; SSE-NEXT: packuswb %xmm11, %xmm11 663; SSE-NEXT: movdqa %xmm7, %xmm9 664; SSE-NEXT: pandn %xmm11, %xmm9 665; SSE-NEXT: por %xmm10, %xmm9 666; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 667; SSE-NEXT: movdqa %xmm3, %xmm11 668; SSE-NEXT: pand %xmm10, %xmm11 669; SSE-NEXT: pandn %xmm4, %xmm10 670; SSE-NEXT: por %xmm11, %xmm10 671; SSE-NEXT: movdqa %xmm10, %xmm11 672; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] 673; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] 674; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0] 675; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] 676; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] 677; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7] 678; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7] 679; SSE-NEXT: packuswb %xmm11, %xmm11 680; SSE-NEXT: pand %xmm7, %xmm11 681; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] 682; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6] 683; SSE-NEXT: packuswb %xmm12, %xmm12 684; SSE-NEXT: movdqa %xmm7, %xmm10 685; SSE-NEXT: pandn %xmm12, %xmm10 686; SSE-NEXT: por %xmm11, %xmm10 687; SSE-NEXT: pand %xmm1, %xmm3 688; SSE-NEXT: pandn %xmm4, %xmm1 689; SSE-NEXT: por %xmm3, %xmm1 690; SSE-NEXT: movdqa %xmm1, %xmm3 691; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 692; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 693; SSE-NEXT: pand %xmm8, %xmm1 694; SSE-NEXT: pandn %xmm3, %xmm8 695; SSE-NEXT: por %xmm1, %xmm8 696; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] 697; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] 698; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 699; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 700; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 701; SSE-NEXT: packuswb %xmm1, %xmm1 702; SSE-NEXT: pand %xmm7, %xmm1 703; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 704; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] 705; SSE-NEXT: packuswb %xmm0, %xmm0 706; SSE-NEXT: pandn %xmm0, %xmm7 707; SSE-NEXT: por %xmm1, %xmm7 708; SSE-NEXT: movq %xmm5, (%rsi) 709; SSE-NEXT: movq %xmm2, (%rdx) 710; SSE-NEXT: movq %xmm9, (%rcx) 711; SSE-NEXT: movq %xmm10, (%r8) 712; SSE-NEXT: movq %xmm7, (%r9) 713; SSE-NEXT: retq 714; 715; AVX-LABEL: load_i8_stride5_vf8: 716; AVX: # %bb.0: 717; AVX-NEXT: vmovdqa (%rdi), %xmm0 718; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 719; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 720; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] 721; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] 722; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] 723; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 724; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u] 725; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 726; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] 727; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 728; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 729; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] 730; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] 731; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] 732; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 733; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 734; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] 735; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] 736; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] 737; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 738; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 739; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] 740; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u] 741; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] 742; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 743; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 744; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u] 745; AVX-NEXT: vmovq %xmm3, (%rsi) 746; AVX-NEXT: vmovq %xmm4, (%rdx) 747; AVX-NEXT: vmovq %xmm5, (%rcx) 748; AVX-NEXT: vmovq %xmm6, (%r8) 749; AVX-NEXT: vmovq %xmm0, (%r9) 750; AVX-NEXT: retq 751; 752; AVX2-LABEL: load_i8_stride5_vf8: 753; AVX2: # %bb.0: 754; AVX2-NEXT: vmovdqa (%rdi), %xmm0 755; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 756; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 757; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 758; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 759; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 760; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 761; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 762; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 763; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 764; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 765; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 766; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 767; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 768; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 769; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 770; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 771; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 772; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 773; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 774; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 775; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 776; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 777; AVX2-NEXT: vmovq %xmm3, (%rsi) 778; AVX2-NEXT: vmovq %xmm4, (%rdx) 779; AVX2-NEXT: vmovq %xmm5, (%rcx) 780; AVX2-NEXT: vmovq %xmm6, (%r8) 781; AVX2-NEXT: vmovq %xmm0, (%r9) 782; AVX2-NEXT: retq 783; 784; AVX2-FP-LABEL: load_i8_stride5_vf8: 785; AVX2-FP: # %bb.0: 786; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 787; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 788; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 789; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 790; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 791; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 792; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 793; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 794; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 795; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 796; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 797; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 798; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 799; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 800; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 801; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 802; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 803; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 804; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 805; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 806; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 807; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 808; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 809; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) 810; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) 811; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) 812; AVX2-FP-NEXT: vmovq %xmm6, (%r8) 813; AVX2-FP-NEXT: vmovq %xmm0, (%r9) 814; AVX2-FP-NEXT: retq 815; 816; AVX2-FCP-LABEL: load_i8_stride5_vf8: 817; AVX2-FCP: # %bb.0: 818; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 819; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 820; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 821; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 822; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 823; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 824; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 825; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 826; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 827; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 828; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 829; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 830; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 831; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 832; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 833; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 834; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 835; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 836; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 837; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 838; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 839; AVX2-FCP-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 840; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 841; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) 842; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) 843; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) 844; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) 845; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) 846; AVX2-FCP-NEXT: retq 847; 848; AVX512-LABEL: load_i8_stride5_vf8: 849; AVX512: # %bb.0: 850; AVX512-NEXT: vmovdqa (%rdi), %xmm0 851; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 852; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 853; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 854; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 855; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 856; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 857; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 858; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 859; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 860; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 861; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 862; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 863; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 864; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 865; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 866; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 867; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 868; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 869; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 870; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 871; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 872; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 873; AVX512-NEXT: vmovq %xmm3, (%rsi) 874; AVX512-NEXT: vmovq %xmm4, (%rdx) 875; AVX512-NEXT: vmovq %xmm5, (%rcx) 876; AVX512-NEXT: vmovq %xmm6, (%r8) 877; AVX512-NEXT: vmovq %xmm0, (%r9) 878; AVX512-NEXT: retq 879; 880; AVX512-FCP-LABEL: load_i8_stride5_vf8: 881; AVX512-FCP: # %bb.0: 882; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 883; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 884; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 885; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 886; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 887; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 888; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 889; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 890; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 891; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 892; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 893; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 894; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 895; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 896; AVX512-FCP-NEXT: 
vpor %xmm5, %xmm6, %xmm5 897; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 898; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 899; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 900; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 901; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 902; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 903; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 904; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 905; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) 906; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) 907; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) 908; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) 909; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) 910; AVX512-FCP-NEXT: retq 911; 912; AVX512DQ-LABEL: load_i8_stride5_vf8: 913; AVX512DQ: # %bb.0: 914; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 915; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 916; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 917; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 918; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 919; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 920; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 921; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 922; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 923; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 924; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 925; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 926; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 927; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 928; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 929; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 930; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 931; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 932; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 933; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 934; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 935; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 936; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 937; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) 938; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) 939; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) 940; AVX512DQ-NEXT: vmovq %xmm6, (%r8) 941; AVX512DQ-NEXT: vmovq %xmm0, (%r9) 942; AVX512DQ-NEXT: retq 943; 944; AVX512DQ-FCP-LABEL: load_i8_stride5_vf8: 945; AVX512DQ-FCP: # %bb.0: 946; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 947; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 948; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 949; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 950; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 951; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 952; 
AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 953; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 954; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 955; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 956; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 957; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 958; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 959; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 960; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 961; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 962; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 963; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 964; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 965; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 966; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 967; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 968; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 969; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) 970; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) 971; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) 972; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) 973; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) 974; AVX512DQ-FCP-NEXT: retq 975; 976; AVX512BW-LABEL: load_i8_stride5_vf8: 977; AVX512BW: # %bb.0: 978; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 979; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 980; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 981; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 982; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 983; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 984; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 985; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 986; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 987; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 988; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 989; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 990; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 991; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 992; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 993; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 994; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 995; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 996; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 997; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 998; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 999; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 1000; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1001; 
AVX512BW-NEXT: vmovq %xmm3, (%rsi) 1002; AVX512BW-NEXT: vmovq %xmm4, (%rdx) 1003; AVX512BW-NEXT: vmovq %xmm5, (%rcx) 1004; AVX512BW-NEXT: vmovq %xmm6, (%r8) 1005; AVX512BW-NEXT: vmovq %xmm0, (%r9) 1006; AVX512BW-NEXT: retq 1007; 1008; AVX512BW-FCP-LABEL: load_i8_stride5_vf8: 1009; AVX512BW-FCP: # %bb.0: 1010; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1011; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 1012; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 1013; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 1014; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 1015; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 1016; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 1017; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 1018; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 1019; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 1020; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 1021; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 1022; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] 1023; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] 1024; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 1025; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] 1026; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] 1027; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] 1028; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1029; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] 1030; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] 1031; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] 1032; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1033; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) 1034; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) 1035; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) 1036; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) 1037; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) 1038; AVX512BW-FCP-NEXT: retq 1039; 1040; AVX512DQ-BW-LABEL: load_i8_stride5_vf8: 1041; AVX512DQ-BW: # %bb.0: 1042; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 1043; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 1044; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 1045; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] 1046; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] 1047; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] 1048; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 1049; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] 1050; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 1051; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] 1052; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 1053; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] 1054; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <40 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
  %strided.vec1 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
  %strided.vec2 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
  %strided.vec3 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
  %strided.vec4 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>
  store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i8_stride5_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa 64(%rdi), %xmm9
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm15
; SSE-NEXT: movdqa 32(%rdi), %xmm10
; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: por %xmm0, %xmm4
; SSE-NEXT: pxor %xmm8, %xmm8
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: pandn %xmm15, %xmm5
; SSE-NEXT: movdqa %xmm1, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm13
; SSE-NEXT: pand %xmm4, %xmm6
; SSE-NEXT: por %xmm5, %xmm6
; SSE-NEXT: movdqa %xmm6, %xmm5
; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
; SSE-NEXT: pand %xmm7, %xmm6
; SSE-NEXT: pandn %xmm5, %xmm7
; SSE-NEXT: por %xmm6,
%xmm7 1158; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] 1159; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] 1160; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] 1161; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] 1162; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] 1163; SSE-NEXT: packuswb %xmm7, %xmm7 1164; SSE-NEXT: pand %xmm11, %xmm7 1165; SSE-NEXT: movdqa %xmm11, %xmm5 1166; SSE-NEXT: pandn %xmm0, %xmm5 1167; SSE-NEXT: por %xmm5, %xmm7 1168; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 1169; SSE-NEXT: pand %xmm6, %xmm7 1170; SSE-NEXT: movdqa %xmm9, %xmm1 1171; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 1172; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 1173; SSE-NEXT: movdqa %xmm1, %xmm0 1174; SSE-NEXT: movdqa %xmm1, %xmm5 1175; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1176; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] 1177; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] 1178; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 1179; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 1180; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] 1181; SSE-NEXT: packuswb %xmm0, %xmm0 1182; SSE-NEXT: movdqa %xmm6, %xmm1 1183; SSE-NEXT: pandn %xmm0, %xmm1 1184; SSE-NEXT: por %xmm7, %xmm1 1185; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1186; SSE-NEXT: movdqa %xmm4, %xmm7 1187; SSE-NEXT: pandn %xmm10, %xmm7 1188; SSE-NEXT: movdqa %xmm2, %xmm0 1189; SSE-NEXT: pand %xmm4, %xmm0 1190; SSE-NEXT: por %xmm7, %xmm0 1191; SSE-NEXT: movdqa %xmm0, %xmm12 1192; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] 1193; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 1194; SSE-NEXT: movdqa %xmm0, %xmm7 1195; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0] 1196; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3] 1197; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 1198; SSE-NEXT: movdqa %xmm14, %xmm12 1199; SSE-NEXT: movdqa %xmm13, %xmm1 1200; SSE-NEXT: pandn %xmm13, %xmm12 1201; SSE-NEXT: movdqa %xmm15, %xmm13 1202; SSE-NEXT: movdqa %xmm15, %xmm2 1203; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1204; SSE-NEXT: pand %xmm14, %xmm13 1205; SSE-NEXT: por %xmm12, %xmm13 1206; SSE-NEXT: movdqa %xmm13, %xmm12 1207; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] 1208; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] 1209; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0] 1210; SSE-NEXT: pand %xmm15, %xmm13 1211; SSE-NEXT: pandn %xmm12, %xmm15 1212; SSE-NEXT: por %xmm13, %xmm15 1213; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3] 1214; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] 1215; SSE-NEXT: pshufd 
{{.*#+}} xmm12 = xmm12[0,3,2,1] 1216; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7] 1217; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7] 1218; SSE-NEXT: packuswb %xmm12, %xmm12 1219; SSE-NEXT: pand %xmm11, %xmm12 1220; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] 1221; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] 1222; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] 1223; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] 1224; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] 1225; SSE-NEXT: psllq $48, %xmm0 1226; SSE-NEXT: packuswb %xmm7, %xmm0 1227; SSE-NEXT: movdqa %xmm5, %xmm7 1228; SSE-NEXT: pandn %xmm0, %xmm11 1229; SSE-NEXT: por %xmm11, %xmm12 1230; SSE-NEXT: pand %xmm6, %xmm12 1231; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0] 1232; SSE-NEXT: movaps %xmm9, %xmm0 1233; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] 1234; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1235; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 1236; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 1237; SSE-NEXT: packuswb %xmm0, %xmm0 1238; SSE-NEXT: movdqa %xmm6, %xmm5 1239; SSE-NEXT: pandn %xmm0, %xmm5 1240; SSE-NEXT: por %xmm12, %xmm5 1241; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1242; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 1243; SSE-NEXT: movdqa %xmm12, %xmm0 1244; SSE-NEXT: pandn %xmm1, %xmm0 1245; SSE-NEXT: movdqa %xmm1, %xmm5 1246; SSE-NEXT: pand %xmm12, %xmm2 1247; SSE-NEXT: por %xmm0, %xmm2 1248; SSE-NEXT: movdqa %xmm2, %xmm0 1249; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 1250; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535] 1251; SSE-NEXT: movdqa %xmm13, %xmm15 1252; SSE-NEXT: pandn %xmm0, %xmm15 1253; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 1254; SSE-NEXT: pand %xmm13, %xmm2 1255; SSE-NEXT: por %xmm15, %xmm2 1256; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] 1257; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1258; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1259; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 1260; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 1261; SSE-NEXT: packuswb %xmm0, %xmm0 1262; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,65535,65535,65535,65535,65535] 1263; SSE-NEXT: pandn %xmm0, %xmm15 1264; SSE-NEXT: movdqa %xmm4, %xmm0 1265; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 1266; SSE-NEXT: pandn %xmm11, %xmm0 1267; SSE-NEXT: movdqa %xmm11, %xmm7 1268; SSE-NEXT: pand %xmm14, %xmm7 1269; SSE-NEXT: pandn %xmm10, %xmm14 1270; SSE-NEXT: pand %xmm12, %xmm11 1271; SSE-NEXT: pandn %xmm10, %xmm12 1272; SSE-NEXT: pand %xmm4, %xmm10 1273; SSE-NEXT: por %xmm0, %xmm10 1274; SSE-NEXT: movdqa %xmm10, %xmm0 1275; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 1276; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 1277; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] 1278; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0] 1279; SSE-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2] 1280; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 1281; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1282; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1283; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 1284; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 1285; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 1286; SSE-NEXT: packuswb %xmm0, %xmm1 1287; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1288; SSE-NEXT: por %xmm15, %xmm1 1289; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 1290; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1291; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] 1292; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 1293; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] 1294; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 1295; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 1296; SSE-NEXT: packuswb %xmm0, %xmm0 1297; SSE-NEXT: movdqa %xmm6, %xmm10 1298; SSE-NEXT: pandn %xmm0, %xmm10 1299; SSE-NEXT: pand %xmm6, %xmm1 1300; SSE-NEXT: por %xmm1, %xmm10 1301; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 1302; SSE-NEXT: movdqa %xmm15, %xmm0 1303; SSE-NEXT: pand %xmm3, %xmm0 1304; SSE-NEXT: pandn %xmm5, %xmm3 1305; SSE-NEXT: por %xmm0, %xmm3 1306; SSE-NEXT: movdqa %xmm3, %xmm0 1307; SSE-NEXT: pxor %xmm1, %xmm1 1308; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1309; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1310; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0] 1311; SSE-NEXT: por %xmm7, %xmm14 1312; SSE-NEXT: movdqa %xmm14, %xmm0 1313; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1314; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] 1315; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] 1316; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7] 1317; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] 1318; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] 1319; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 1320; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 1321; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] 1322; SSE-NEXT: packuswb %xmm1, %xmm0 1323; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] 1324; SSE-NEXT: pand %xmm1, %xmm0 1325; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5] 1326; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] 1327; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] 1328; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] 1329; SSE-NEXT: packuswb %xmm7, %xmm7 1330; SSE-NEXT: pandn %xmm7, %xmm1 1331; SSE-NEXT: movaps %xmm9, %xmm7 1332; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0] 1333; SSE-NEXT: por %xmm1, %xmm0 1334; SSE-NEXT: movaps %xmm2, %xmm1 1335; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] 1336; SSE-NEXT: pand %xmm6, %xmm0 1337; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 1338; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 1339; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] 1340; SSE-NEXT: packuswb %xmm1, %xmm1 1341; SSE-NEXT: pandn %xmm1, %xmm6 1342; SSE-NEXT: por %xmm0, %xmm6 1343; SSE-NEXT: por %xmm11, %xmm12 1344; SSE-NEXT: movdqa %xmm12, %xmm1 1345; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 1346; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] 1347; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] 1348; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2] 1349; SSE-NEXT: movdqa %xmm15, %xmm1 1350; SSE-NEXT: pand %xmm4, %xmm1 1351; SSE-NEXT: pandn %xmm5, %xmm4 1352; SSE-NEXT: por %xmm1, %xmm4 1353; SSE-NEXT: movdqa %xmm4, %xmm1 1354; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 1355; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] 1356; SSE-NEXT: pand %xmm13, %xmm4 1357; SSE-NEXT: pandn %xmm1, %xmm13 1358; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1] 1359; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] 1360; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 1361; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1362; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] 1363; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] 1364; SSE-NEXT: packuswb %xmm1, %xmm0 1365; SSE-NEXT: por %xmm4, %xmm13 1366; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] 1367; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7] 1368; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] 1369; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] 1370; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] 1371; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] 1372; SSE-NEXT: packuswb %xmm4, %xmm4 1373; SSE-NEXT: pand %xmm3, %xmm4 1374; SSE-NEXT: pandn %xmm0, %xmm3 1375; SSE-NEXT: por %xmm3, %xmm4 1376; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] 1377; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] 1378; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 1379; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 1380; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1381; SSE-NEXT: packuswb %xmm1, %xmm2 1382; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1] 1383; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1384; SSE-NEXT: movaps %xmm0, (%rsi) 1385; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1386; SSE-NEXT: movaps %xmm0, (%rdx) 1387; SSE-NEXT: movdqa %xmm10, (%rcx) 1388; SSE-NEXT: movdqa %xmm6, (%r8) 1389; SSE-NEXT: movaps %xmm4, (%r9) 1390; SSE-NEXT: retq 1391; 1392; AVX-LABEL: load_i8_stride5_vf16: 1393; AVX: # %bb.0: 1394; AVX-NEXT: vmovdqa (%rdi), %xmm0 1395; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 1396; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 1397; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 1398; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] 1399; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] 1400; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 1401; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u] 1402; AVX-NEXT: 
vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u] 1403; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 1404; AVX-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 1405; AVX-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4 1406; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1407; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm5 1408; AVX-NEXT: vmovdqa 64(%rdi), %xmm4 1409; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] 1410; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 1411; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 1412; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u] 1413; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 1414; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u] 1415; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u] 1416; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 1417; AVX-NEXT: vpblendvb %xmm6, %xmm8, %xmm9, %xmm6 1418; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6 1419; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] 1420; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 1421; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] 1422; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm9 1423; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm8 1424; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6,7] 1425; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] 1426; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 1427; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9 1428; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7] 1429; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 1430; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm8 1431; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] 1432; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 1433; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] 1434; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm10 1435; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9 1436; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7] 1437; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u] 1438; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm11 1439; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm10 1440; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5,6,7] 1441; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 1442; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm7 1443; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] 1444; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 1445; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] 1446; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3 1447; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2 1448; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] 1449; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] 1450; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1451; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1452; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] 1453; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1454; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1455; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 1456; AVX-NEXT: vmovdqa %xmm5, (%rsi) 1457; AVX-NEXT: vmovdqa %xmm6, (%rdx) 
1458; AVX-NEXT: vmovdqa %xmm8, (%rcx) 1459; AVX-NEXT: vmovdqa %xmm7, (%r8) 1460; AVX-NEXT: vmovdqa %xmm0, (%r9) 1461; AVX-NEXT: retq 1462; 1463; AVX2-LABEL: load_i8_stride5_vf16: 1464; AVX2: # %bb.0: 1465; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1466; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1467; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 1468; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 1469; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1470; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 1471; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 1472; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 1473; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1474; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm3 1475; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 1476; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] 1477; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 1478; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 1479; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 1480; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 1481; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 1482; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 1483; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 1484; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 1485; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] 1486; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 1487; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 1488; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 1489; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 1490; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 1491; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 1492; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 1493; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 1494; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] 1495; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 1496; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 1497; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 1498; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] 1499; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7 1500; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 1501; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 1502; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 1503; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] 1504; AVX2-NEXT: vpor %xmm7, %xmm4, %xmm4 1505; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 1506; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 1507; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1508; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 1509; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] 1510; AVX2-NEXT: vpor %xmm1, %xmm0, 
%xmm0 1511; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1512; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1513; AVX2-NEXT: vmovdqa %xmm3, (%rsi) 1514; AVX2-NEXT: vmovdqa %xmm5, (%rdx) 1515; AVX2-NEXT: vmovdqa %xmm6, (%rcx) 1516; AVX2-NEXT: vmovdqa %xmm4, (%r8) 1517; AVX2-NEXT: vmovdqa %xmm0, (%r9) 1518; AVX2-NEXT: vzeroupper 1519; AVX2-NEXT: retq 1520; 1521; AVX2-FP-LABEL: load_i8_stride5_vf16: 1522; AVX2-FP: # %bb.0: 1523; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1524; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 1525; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 1526; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 1527; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 1528; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 1529; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 1530; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 1531; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1532; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 1533; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2 1534; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] 1535; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 1536; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 1537; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 1538; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 1539; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 1540; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 1541; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 1542; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 1543; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] 1544; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 1545; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 1546; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 1547; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 1548; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 1549; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 1550; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 1551; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 1552; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] 1553; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 1554; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 1555; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 1556; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] 1557; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm7 1558; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 1559; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 1560; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 1561; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] 1562; AVX2-FP-NEXT: vpor %xmm7, %xmm4, %xmm4 1563; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = 
[0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 1564; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 1565; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 1566; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 1567; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] 1568; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 1569; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1570; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1571; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi) 1572; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) 1573; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx) 1574; AVX2-FP-NEXT: vmovdqa %xmm4, (%r8) 1575; AVX2-FP-NEXT: vmovdqa %xmm0, (%r9) 1576; AVX2-FP-NEXT: vzeroupper 1577; AVX2-FP-NEXT: retq 1578; 1579; AVX2-FCP-LABEL: load_i8_stride5_vf16: 1580; AVX2-FCP: # %bb.0: 1581; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1582; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1583; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 1584; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 1585; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1586; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 1587; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 1588; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1589; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1590; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 1591; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 1592; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] 1593; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 1594; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 1595; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 1596; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 1597; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 1598; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 1599; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1600; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 1601; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] 1602; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1603; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 1604; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 1605; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1606; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 1607; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 1608; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1609; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 1610; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] 1611; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1612; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 1613; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 1614; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] 1615; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 1616; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 1617; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1618; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 1619; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] 1620; AVX2-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 1621; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 1622; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 1623; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1624; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 1625; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] 1626; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1627; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1628; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1629; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi) 1630; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) 1631; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx) 1632; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r8) 1633; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r9) 1634; AVX2-FCP-NEXT: vzeroupper 1635; AVX2-FCP-NEXT: retq 1636; 1637; AVX512-LABEL: load_i8_stride5_vf16: 1638; AVX512: # %bb.0: 1639; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 1640; AVX512-NEXT: vmovdqa (%rdi), %ymm4 1641; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 1642; AVX512-NEXT: vmovdqa %ymm1, %ymm0 1643; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5)) 1644; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 1645; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] 1646; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] 1647; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 1648; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1649; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm2 1650; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 1651; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] 1652; AVX512-NEXT: vpor %xmm6, %xmm2, %xmm6 1653; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 1654; AVX512-NEXT: vmovdqa %ymm2, %ymm7 1655; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) 1656; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] 1657; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 1658; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] 1659; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 1660; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm7 1661; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] 1662; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 1663; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 1664; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) 1665; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 1666; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = 
zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] 1667; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] 1668; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8 1669; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm8 1670; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] 1671; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8 1672; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) 1673; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] 1674; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 1675; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] 1676; AVX512-NEXT: vpor %xmm1, %xmm9, %xmm1 1677; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1678; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] 1679; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 1680; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) 1681; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 1682; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 1683; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 1684; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 1685; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1686; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] 1687; AVX512-NEXT: vmovdqa %xmm6, (%rsi) 1688; AVX512-NEXT: vmovdqa %xmm7, (%rdx) 1689; AVX512-NEXT: vmovdqa %xmm8, (%rcx) 1690; AVX512-NEXT: vmovdqa %xmm1, (%r8) 1691; AVX512-NEXT: vmovdqa %xmm0, (%r9) 1692; AVX512-NEXT: vzeroupper 1693; AVX512-NEXT: retq 1694; 1695; AVX512-FCP-LABEL: load_i8_stride5_vf16: 1696; AVX512-FCP: # %bb.0: 1697; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 1698; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 1699; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 1700; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 1701; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5)) 1702; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 1703; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] 1704; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] 1705; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 1706; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1707; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm2 1708; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 1709; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] 1710; AVX512-FCP-NEXT: vpor %xmm6, %xmm2, %xmm6 1711; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 1712; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm7 1713; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) 1714; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] 1715; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 1716; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] 1717; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1718; AVX512-FCP-NEXT: vpshufb 
%xmm3, %xmm7, %xmm7 1719; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] 1720; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1721; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 1722; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) 1723; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 1724; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] 1725; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] 1726; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 1727; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 1728; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] 1729; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 1730; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) 1731; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] 1732; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 1733; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] 1734; AVX512-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 1735; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1736; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] 1737; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 1738; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) 1739; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1740; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 1741; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 1742; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1743; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1744; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] 1745; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsi) 1746; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rdx) 1747; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rcx) 1748; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8) 1749; AVX512-FCP-NEXT: vmovdqa %xmm0, (%r9) 1750; AVX512-FCP-NEXT: vzeroupper 1751; AVX512-FCP-NEXT: retq 1752; 1753; AVX512DQ-LABEL: load_i8_stride5_vf16: 1754; AVX512DQ: # %bb.0: 1755; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 1756; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 1757; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 1758; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm0 1759; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5)) 1760; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 1761; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] 1762; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] 1763; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 1764; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1765; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm2 1766; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 1767; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] 1768; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm6 1769; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 1770; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm7 1771; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) 1772; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] 1773; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7 1774; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] 1775; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 1776; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm7 1777; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] 1778; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 1779; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 1780; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) 1781; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 1782; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] 1783; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] 1784; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8 1785; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm8 1786; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] 1787; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8 1788; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) 1789; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] 1790; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 1791; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] 1792; AVX512DQ-NEXT: vpor %xmm1, %xmm9, %xmm1 1793; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1794; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] 1795; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 1796; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) 1797; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 1798; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 1799; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 1800; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 1801; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1802; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] 1803; AVX512DQ-NEXT: vmovdqa %xmm6, (%rsi) 1804; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx) 1805; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx) 1806; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8) 1807; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9) 1808; AVX512DQ-NEXT: vzeroupper 1809; AVX512DQ-NEXT: retq 1810; 1811; AVX512DQ-FCP-LABEL: load_i8_stride5_vf16: 1812; AVX512DQ-FCP: # %bb.0: 1813; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 1814; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 1815; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 1816; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0 1817; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5)) 1818; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 1819; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] 1820; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] 1821; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 1822; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1823; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm2 1824; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 1825; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] 1826; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm2, %xmm6 1827; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 1828; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm7 1829; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) 1830; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] 1831; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 1832; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] 1833; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1834; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7 1835; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] 1836; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1837; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 1838; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) 1839; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 1840; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] 1841; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] 1842; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 1843; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 1844; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] 1845; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 1846; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) 1847; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] 1848; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 1849; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] 1850; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 1851; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1852; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] 1853; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 1854; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) 1855; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1856; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 1857; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 1858; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1859; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1860; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] 1861; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsi) 1862; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rdx) 1863; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rcx) 1864; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8) 1865; 
AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%r9) 1866; AVX512DQ-FCP-NEXT: vzeroupper 1867; AVX512DQ-FCP-NEXT: retq 1868; 1869; AVX512BW-LABEL: load_i8_stride5_vf16: 1870; AVX512BW: # %bb.0: 1871; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1872; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 1873; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52 1874; AVX512BW-NEXT: kmovd %eax, %k1 1875; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} 1876; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 1877; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 1878; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 1879; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 1880; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1881; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1882; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 1883; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] 1884; AVX512BW-NEXT: vpor %xmm5, %xmm2, %xmm2 1885; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294 1886; AVX512BW-NEXT: kmovd %eax, %k2 1887; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} 1888; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 1889; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 1890; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 1891; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 1892; AVX512BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 1893; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] 1894; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 1895; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A 1896; AVX512BW-NEXT: kmovd %eax, %k3 1897; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3} 1898; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 1899; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 1900; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 1901; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 1902; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 1903; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] 1904; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 1905; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} 1906; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] 1907; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 1908; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 1909; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 1910; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 1911; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] 1912; AVX512BW-NEXT: vpor %xmm7, %xmm3, %xmm3 1913; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} 1914; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 1915; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] 1916; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] 1917; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 1918; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1919; AVX512BW-NEXT: 
vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1920; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) 1921; AVX512BW-NEXT: vmovdqa %xmm5, (%rdx) 1922; AVX512BW-NEXT: vmovdqa %xmm6, (%rcx) 1923; AVX512BW-NEXT: vmovdqa %xmm3, (%r8) 1924; AVX512BW-NEXT: vmovdqa %xmm0, (%r9) 1925; AVX512BW-NEXT: vzeroupper 1926; AVX512BW-NEXT: retq 1927; 1928; AVX512BW-FCP-LABEL: load_i8_stride5_vf16: 1929; AVX512BW-FCP: # %bb.0: 1930; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1931; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1932; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 1933; AVX512BW-FCP-NEXT: kmovd %eax, %k1 1934; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} 1935; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1936; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 1937; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 1938; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1939; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1940; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1941; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 1942; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] 1943; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 1944; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 1945; AVX512BW-FCP-NEXT: kmovd %eax, %k2 1946; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} 1947; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 1948; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 1949; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 1950; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1951; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 1952; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] 1953; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1954; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A 1955; AVX512BW-FCP-NEXT: kmovd %eax, %k3 1956; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3} 1957; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1958; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 1959; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 1960; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1961; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 1962; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] 1963; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1964; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} 1965; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] 1966; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 1967; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 1968; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1969; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 1970; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] 1971; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 1972; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} 1973; AVX512BW-FCP-NEXT: vextracti128 $1, 
%ymm1, %xmm0 1974; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] 1975; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] 1976; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 1977; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 1978; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1979; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1980; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rdx) 1981; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rcx) 1982; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%r8) 1983; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r9) 1984; AVX512BW-FCP-NEXT: vzeroupper 1985; AVX512BW-FCP-NEXT: retq 1986; 1987; AVX512DQ-BW-LABEL: load_i8_stride5_vf16: 1988; AVX512DQ-BW: # %bb.0: 1989; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 1990; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 1991; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52 1992; AVX512DQ-BW-NEXT: kmovd %eax, %k1 1993; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} 1994; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 1995; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 1996; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 1997; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 1998; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 1999; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2000; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4 2001; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] 2002; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm2, %xmm2 2003; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294 2004; AVX512DQ-BW-NEXT: kmovd %eax, %k2 2005; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} 2006; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 2007; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 2008; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 2009; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 2010; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 2011; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] 2012; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 2013; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A 2014; AVX512DQ-BW-NEXT: kmovd %eax, %k3 2015; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3} 2016; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 2017; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 2018; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 2019; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 2020; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 2021; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] 2022; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 2023; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} 2024; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] 2025; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 2026; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 2027; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 2028; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 2029; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] 2030; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm3, %xmm3 2031; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} 2032; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm0 2033; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] 2034; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] 2035; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 2036; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 2037; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 2038; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) 2039; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rdx) 2040; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rcx) 2041; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%r8) 2042; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%r9) 2043; AVX512DQ-BW-NEXT: vzeroupper 2044; AVX512DQ-BW-NEXT: retq 2045; 2046; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf16: 2047; AVX512DQ-BW-FCP: # %bb.0: 2048; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 2049; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 2050; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 2051; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 2052; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} 2053; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 2054; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] 2055; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] 2056; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 2057; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] 2058; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2059; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 2060; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] 2061; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 2062; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 2063; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 2064; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} 2065; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 2066; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 2067; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 2068; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 2069; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 2070; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] 2071; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 2072; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A 2073; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 2074; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3} 2075; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 2076; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 2077; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] 2078; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <80 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
  %strided.vec1 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
  %strided.vec2 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
  %strided.vec3 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
  %strided.vec4 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>
  store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i8_stride5_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $184, %rsp
; SSE-NEXT: movdqa (%rdi), %xmm9
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2125; SSE-NEXT: movdqa 32(%rdi), %xmm1 2126; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2127; SSE-NEXT: movdqa 48(%rdi), %xmm2 2128; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2129; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 2130; SSE-NEXT: movdqa %xmm4, %xmm0 2131; SSE-NEXT: pandn %xmm1, %xmm0 2132; SSE-NEXT: movdqa %xmm2, %xmm1 2133; SSE-NEXT: pand %xmm4, %xmm1 2134; SSE-NEXT: por %xmm0, %xmm1 2135; SSE-NEXT: pxor %xmm5, %xmm5 2136; SSE-NEXT: movdqa %xmm1, %xmm0 2137; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 2138; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 2139; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 2140; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 2141; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 2142; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] 2143; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 2144; SSE-NEXT: packuswb %xmm1, %xmm0 2145; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3] 2146; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] 2147; SSE-NEXT: movdqa %xmm13, %xmm0 2148; SSE-NEXT: pandn %xmm1, %xmm0 2149; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 2150; SSE-NEXT: movdqa %xmm15, %xmm1 2151; SSE-NEXT: pandn %xmm3, %xmm1 2152; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 2153; SSE-NEXT: pandn %xmm9, %xmm11 2154; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2155; SSE-NEXT: movdqa %xmm14, %xmm2 2156; SSE-NEXT: pandn %xmm9, %xmm2 2157; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2158; SSE-NEXT: movdqa %xmm4, %xmm2 2159; SSE-NEXT: pandn %xmm9, %xmm2 2160; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2161; SSE-NEXT: movdqa %xmm15, %xmm2 2162; SSE-NEXT: pandn %xmm9, %xmm2 2163; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2164; SSE-NEXT: pand %xmm15, %xmm9 2165; SSE-NEXT: por %xmm1, %xmm9 2166; SSE-NEXT: movdqa %xmm9, %xmm2 2167; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 2168; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] 2169; SSE-NEXT: movdqa %xmm1, %xmm6 2170; SSE-NEXT: pandn %xmm2, %xmm6 2171; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] 2172; SSE-NEXT: pand %xmm1, %xmm9 2173; SSE-NEXT: por %xmm6, %xmm9 2174; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7] 2175; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] 2176; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 2177; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] 2178; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] 2179; SSE-NEXT: packuswb %xmm2, %xmm2 2180; SSE-NEXT: pand %xmm13, %xmm2 2181; SSE-NEXT: por %xmm0, %xmm2 2182; SSE-NEXT: movdqa 64(%rdi), %xmm6 2183; SSE-NEXT: movdqa %xmm6, %xmm3 2184; SSE-NEXT: pxor %xmm0, %xmm0 2185; 
SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2186; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2187; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] 2188; SSE-NEXT: movdqa %xmm6, %xmm0 2189; SSE-NEXT: movdqa %xmm6, %xmm8 2190; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2191; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] 2192; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] 2193; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 2194; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 2195; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] 2196; SSE-NEXT: packuswb %xmm0, %xmm0 2197; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 2198; SSE-NEXT: movdqa %xmm9, %xmm6 2199; SSE-NEXT: pandn %xmm0, %xmm6 2200; SSE-NEXT: pand %xmm9, %xmm2 2201; SSE-NEXT: por %xmm2, %xmm6 2202; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2203; SSE-NEXT: movdqa 112(%rdi), %xmm10 2204; SSE-NEXT: movdqa %xmm4, %xmm0 2205; SSE-NEXT: pandn %xmm10, %xmm0 2206; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2207; SSE-NEXT: movdqa 128(%rdi), %xmm7 2208; SSE-NEXT: movdqa %xmm7, %xmm2 2209; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2210; SSE-NEXT: pand %xmm4, %xmm2 2211; SSE-NEXT: por %xmm0, %xmm2 2212; SSE-NEXT: movdqa %xmm2, %xmm0 2213; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 2214; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] 2215; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] 2216; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2217; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 2218; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] 2219; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] 2220; SSE-NEXT: packuswb %xmm2, %xmm0 2221; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] 2222; SSE-NEXT: movdqa %xmm13, %xmm2 2223; SSE-NEXT: movdqa %xmm13, %xmm3 2224; SSE-NEXT: pandn %xmm0, %xmm2 2225; SSE-NEXT: movdqa 96(%rdi), %xmm4 2226; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill 2227; SSE-NEXT: movdqa %xmm15, %xmm0 2228; SSE-NEXT: pandn %xmm4, %xmm0 2229; SSE-NEXT: movdqa 80(%rdi), %xmm6 2230; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2231; SSE-NEXT: pand %xmm15, %xmm6 2232; SSE-NEXT: por %xmm0, %xmm6 2233; SSE-NEXT: movdqa %xmm6, %xmm0 2234; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 2235; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 2236; SSE-NEXT: pand %xmm1, %xmm6 2237; SSE-NEXT: pandn %xmm0, %xmm1 2238; SSE-NEXT: por %xmm6, %xmm1 2239; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7] 2240; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] 2241; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2242; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,2,1,3,4,5,6,7] 2243; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] 2244; SSE-NEXT: packuswb %xmm0, %xmm0 2245; SSE-NEXT: pand %xmm13, %xmm0 2246; SSE-NEXT: por %xmm2, %xmm0 2247; SSE-NEXT: movdqa 144(%rdi), %xmm12 2248; SSE-NEXT: movdqa %xmm12, %xmm2 2249; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 2250; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2251; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] 2252; SSE-NEXT: movdqa %xmm12, %xmm1 2253; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2254; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] 2255; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] 2256; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 2257; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2258; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] 2259; SSE-NEXT: packuswb %xmm1, %xmm1 2260; SSE-NEXT: movdqa %xmm9, %xmm2 2261; SSE-NEXT: pandn %xmm1, %xmm2 2262; SSE-NEXT: pand %xmm9, %xmm0 2263; SSE-NEXT: por %xmm0, %xmm2 2264; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2265; SSE-NEXT: movdqa %xmm15, %xmm0 2266; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2267; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2268; SSE-NEXT: pand %xmm15, %xmm1 2269; SSE-NEXT: por %xmm0, %xmm1 2270; SSE-NEXT: movdqa %xmm1, %xmm0 2271; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2272; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 2273; SSE-NEXT: movdqa %xmm1, %xmm2 2274; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] 2275; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] 2276; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] 2277; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] 2278; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] 2279; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 2280; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 2281; SSE-NEXT: psllq $48, %xmm1 2282; SSE-NEXT: packuswb %xmm0, %xmm1 2283; SSE-NEXT: movdqa %xmm13, %xmm2 2284; SSE-NEXT: pandn %xmm1, %xmm2 2285; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2286; SSE-NEXT: movdqa %xmm4, %xmm1 2287; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 2288; SSE-NEXT: pand %xmm13, %xmm1 2289; SSE-NEXT: por %xmm11, %xmm1 2290; SSE-NEXT: movdqa %xmm1, %xmm6 2291; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 2292; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] 2293; SSE-NEXT: movdqa %xmm0, %xmm11 2294; SSE-NEXT: pandn %xmm6, %xmm11 2295; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 2296; SSE-NEXT: pand %xmm0, %xmm1 2297; SSE-NEXT: por %xmm11, %xmm1 2298; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 2299; SSE-NEXT: pshufhw {{.*#+}} 
xmm1 = xmm1[0,1,2,3,4,7,6,5] 2300; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 2301; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] 2302; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] 2303; SSE-NEXT: packuswb %xmm1, %xmm1 2304; SSE-NEXT: pand %xmm3, %xmm1 2305; SSE-NEXT: movdqa %xmm3, %xmm11 2306; SSE-NEXT: por %xmm2, %xmm1 2307; SSE-NEXT: movdqa %xmm8, %xmm2 2308; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2309; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0] 2310; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] 2311; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7] 2312; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 2313; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 2314; SSE-NEXT: packuswb %xmm2, %xmm2 2315; SSE-NEXT: movdqa %xmm9, %xmm3 2316; SSE-NEXT: pandn %xmm2, %xmm3 2317; SSE-NEXT: pand %xmm9, %xmm1 2318; SSE-NEXT: por %xmm1, %xmm3 2319; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2320; SSE-NEXT: movdqa %xmm15, %xmm2 2321; SSE-NEXT: pandn %xmm10, %xmm2 2322; SSE-NEXT: movdqa %xmm7, %xmm1 2323; SSE-NEXT: pand %xmm15, %xmm1 2324; SSE-NEXT: por %xmm2, %xmm1 2325; SSE-NEXT: movdqa %xmm1, %xmm2 2326; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2327; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 2328; SSE-NEXT: movdqa %xmm1, %xmm6 2329; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0] 2330; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] 2331; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3] 2332; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] 2333; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] 2334; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] 2335; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 2336; SSE-NEXT: psllq $48, %xmm1 2337; SSE-NEXT: packuswb %xmm2, %xmm1 2338; SSE-NEXT: movdqa %xmm13, %xmm2 2339; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2340; SSE-NEXT: pandn %xmm7, %xmm2 2341; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload 2342; SSE-NEXT: movdqa %xmm8, %xmm6 2343; SSE-NEXT: pand %xmm13, %xmm6 2344; SSE-NEXT: por %xmm2, %xmm6 2345; SSE-NEXT: movdqa %xmm6, %xmm2 2346; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2347; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 2348; SSE-NEXT: pand %xmm0, %xmm6 2349; SSE-NEXT: pandn %xmm2, %xmm0 2350; SSE-NEXT: por %xmm6, %xmm0 2351; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2352; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 2353; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2354; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 2355; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] 2356; SSE-NEXT: packuswb %xmm0, %xmm0 2357; SSE-NEXT: movdqa %xmm11, %xmm2 2358; SSE-NEXT: pand %xmm11, %xmm0 2359; SSE-NEXT: pandn %xmm1, %xmm2 2360; SSE-NEXT: por %xmm2, %xmm0 2361; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2362; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] 2363; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] 2364; SSE-NEXT: 
pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7] 2365; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 2366; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] 2367; SSE-NEXT: packuswb %xmm1, %xmm1 2368; SSE-NEXT: movdqa %xmm9, %xmm2 2369; SSE-NEXT: pandn %xmm1, %xmm2 2370; SSE-NEXT: pand %xmm9, %xmm0 2371; SSE-NEXT: por %xmm0, %xmm2 2372; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2373; SSE-NEXT: pand %xmm14, %xmm4 2374; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 2375; SSE-NEXT: movdqa %xmm4, %xmm2 2376; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2377; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535] 2378; SSE-NEXT: movdqa %xmm3, %xmm6 2379; SSE-NEXT: pandn %xmm2, %xmm6 2380; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 2381; SSE-NEXT: pand %xmm3, %xmm4 2382; SSE-NEXT: por %xmm6, %xmm4 2383; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7] 2384; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 2385; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2386; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 2387; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2388; SSE-NEXT: packuswb %xmm0, %xmm0 2389; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] 2390; SSE-NEXT: movdqa %xmm1, %xmm2 2391; SSE-NEXT: movdqa %xmm1, %xmm10 2392; SSE-NEXT: pandn %xmm0, %xmm2 2393; SSE-NEXT: movdqa %xmm15, %xmm0 2394; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2395; SSE-NEXT: pandn %xmm1, %xmm0 2396; SSE-NEXT: movdqa %xmm13, %xmm6 2397; SSE-NEXT: movdqa %xmm13, %xmm12 2398; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 2399; SSE-NEXT: pandn %xmm11, %xmm6 2400; SSE-NEXT: movdqa %xmm14, %xmm4 2401; SSE-NEXT: pandn %xmm11, %xmm4 2402; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2403; SSE-NEXT: pand %xmm15, %xmm11 2404; SSE-NEXT: movdqa %xmm15, %xmm4 2405; SSE-NEXT: por %xmm0, %xmm11 2406; SSE-NEXT: movdqa %xmm11, %xmm0 2407; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2408; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] 2409; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0] 2410; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0] 2411; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] 2412; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 2413; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 2414; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2415; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 2416; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 2417; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5] 2418; SSE-NEXT: packuswb %xmm0, %xmm11 2419; SSE-NEXT: pand %xmm10, %xmm11 2420; SSE-NEXT: por %xmm2, %xmm11 2421; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2422; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2423; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2424; SSE-NEXT: # xmm2 = mem[0,2,2,3] 2425; SSE-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2426; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] 2427; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 2428; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 2429; SSE-NEXT: packuswb %xmm0, %xmm0 2430; SSE-NEXT: movdqa %xmm9, %xmm2 2431; SSE-NEXT: pandn %xmm0, %xmm2 2432; SSE-NEXT: pand %xmm9, %xmm11 2433; SSE-NEXT: por %xmm11, %xmm2 2434; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2435; SSE-NEXT: movdqa %xmm14, %xmm0 2436; SSE-NEXT: pandn %xmm7, %xmm0 2437; SSE-NEXT: movdqa %xmm8, %xmm15 2438; SSE-NEXT: movdqa %xmm8, %xmm2 2439; SSE-NEXT: pand %xmm14, %xmm2 2440; SSE-NEXT: por %xmm0, %xmm2 2441; SSE-NEXT: movdqa %xmm2, %xmm0 2442; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2443; SSE-NEXT: movdqa %xmm3, %xmm11 2444; SSE-NEXT: pandn %xmm0, %xmm11 2445; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 2446; SSE-NEXT: pand %xmm3, %xmm2 2447; SSE-NEXT: por %xmm11, %xmm2 2448; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] 2449; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 2450; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2451; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 2452; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2453; SSE-NEXT: packuswb %xmm0, %xmm0 2454; SSE-NEXT: movdqa %xmm10, %xmm13 2455; SSE-NEXT: pandn %xmm0, %xmm13 2456; SSE-NEXT: movdqa %xmm4, %xmm11 2457; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2458; SSE-NEXT: pandn %xmm2, %xmm11 2459; SSE-NEXT: movdqa %xmm1, %xmm5 2460; SSE-NEXT: movdqa %xmm1, %xmm0 2461; SSE-NEXT: movdqa %xmm12, %xmm1 2462; SSE-NEXT: pand %xmm12, %xmm0 2463; SSE-NEXT: movdqa %xmm2, %xmm7 2464; SSE-NEXT: pand %xmm12, %xmm7 2465; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2466; SSE-NEXT: pandn %xmm8, %xmm1 2467; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2468; SSE-NEXT: pand %xmm14, %xmm5 2469; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2470; SSE-NEXT: pand %xmm14, %xmm2 2471; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2472; SSE-NEXT: pandn %xmm8, %xmm14 2473; SSE-NEXT: pand %xmm4, %xmm8 2474; SSE-NEXT: por %xmm11, %xmm8 2475; SSE-NEXT: movdqa %xmm8, %xmm11 2476; SSE-NEXT: pxor %xmm1, %xmm1 2477; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] 2478; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] 2479; SSE-NEXT: pxor %xmm2, %xmm2 2480; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] 2481; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0] 2482; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2] 2483; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] 2484; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] 2485; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 2486; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] 2487; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] 2488; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 2489; SSE-NEXT: packuswb %xmm8, %xmm1 2490; SSE-NEXT: pand %xmm10, 
%xmm1 2491; SSE-NEXT: por %xmm13, %xmm1 2492; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2493; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] 2494; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2495; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] 2496; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] 2497; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7] 2498; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 2499; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] 2500; SSE-NEXT: packuswb %xmm8, %xmm11 2501; SSE-NEXT: movdqa %xmm9, %xmm12 2502; SSE-NEXT: pandn %xmm11, %xmm12 2503; SSE-NEXT: pand %xmm9, %xmm1 2504; SSE-NEXT: por %xmm1, %xmm12 2505; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2506; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 2507; SSE-NEXT: pand %xmm13, %xmm1 2508; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2509; SSE-NEXT: movdqa %xmm1, %xmm11 2510; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] 2511; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2512; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0] 2513; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 2514; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] 2515; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] 2516; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] 2517; SSE-NEXT: packuswb %xmm1, %xmm1 2518; SSE-NEXT: movdqa %xmm10, %xmm11 2519; SSE-NEXT: pandn %xmm1, %xmm11 2520; SSE-NEXT: por %xmm6, %xmm0 2521; SSE-NEXT: movdqa %xmm0, %xmm1 2522; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2523; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2524; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] 2525; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2526; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 2527; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] 2528; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 2529; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2530; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] 2531; SSE-NEXT: packuswb %xmm0, %xmm1 2532; SSE-NEXT: pand %xmm10, %xmm1 2533; SSE-NEXT: por %xmm11, %xmm1 2534; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2535; SSE-NEXT: movaps %xmm10, %xmm0 2536; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2537; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0] 2538; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] 2539; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7] 2540; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 2541; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5] 2542; SSE-NEXT: packuswb %xmm0, %xmm6 2543; SSE-NEXT: movdqa %xmm9, %xmm8 2544; SSE-NEXT: pandn %xmm6, %xmm8 2545; SSE-NEXT: pand %xmm9, %xmm1 2546; SSE-NEXT: por %xmm1, %xmm8 2547; SSE-NEXT: movdqa %xmm13, %xmm0 2548; SSE-NEXT: pand %xmm13, %xmm15 2549; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload 2550; SSE-NEXT: por %xmm15, %xmm0 2551; SSE-NEXT: movdqa %xmm0, %xmm1 2552; SSE-NEXT: pxor %xmm6, %xmm6 2553; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 2554; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 2555; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] 2556; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2557; SSE-NEXT: por %xmm7, %xmm2 2558; SSE-NEXT: movdqa %xmm2, %xmm1 2559; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 2560; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 2561; SSE-NEXT: pxor %xmm13, %xmm13 2562; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0] 2563; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 2564; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 2565; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 2566; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 2567; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2568; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] 2569; SSE-NEXT: packuswb %xmm2, %xmm1 2570; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535] 2571; SSE-NEXT: pand %xmm2, %xmm1 2572; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5] 2573; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] 2574; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7] 2575; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7] 2576; SSE-NEXT: packuswb %xmm6, %xmm6 2577; SSE-NEXT: pandn %xmm6, %xmm2 2578; SSE-NEXT: por %xmm2, %xmm1 2579; SSE-NEXT: movdqa %xmm4, %xmm2 2580; SSE-NEXT: movdqa %xmm4, %xmm15 2581; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] 2582; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] 2583; SSE-NEXT: pand %xmm9, %xmm1 2584; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] 2585; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 2586; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] 2587; SSE-NEXT: packuswb %xmm2, %xmm2 2588; SSE-NEXT: pandn %xmm2, %xmm9 2589; SSE-NEXT: por %xmm1, %xmm9 2590; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2591; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2592; SSE-NEXT: movdqa %xmm0, %xmm1 2593; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] 2594; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] 2595; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 2596; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 2597; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 2598; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] 2599; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 2600; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2601; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] 2602; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 2603; SSE-NEXT: packuswb %xmm1, %xmm2 2604; SSE-NEXT: movdqa 
{{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] 2605; SSE-NEXT: movdqa %xmm4, %xmm6 2606; SSE-NEXT: pandn %xmm2, %xmm6 2607; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2608; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 2609; SSE-NEXT: pand %xmm5, %xmm2 2610; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2611; SSE-NEXT: movdqa %xmm2, %xmm0 2612; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] 2613; SSE-NEXT: movdqa %xmm3, %xmm11 2614; SSE-NEXT: pandn %xmm0, %xmm11 2615; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] 2616; SSE-NEXT: pand %xmm3, %xmm2 2617; SSE-NEXT: por %xmm11, %xmm2 2618; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7] 2619; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] 2620; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 2621; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] 2622; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 2623; SSE-NEXT: packuswb %xmm2, %xmm2 2624; SSE-NEXT: pand %xmm4, %xmm2 2625; SSE-NEXT: por %xmm6, %xmm2 2626; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 2627; SSE-NEXT: # xmm6 = mem[3,1,2,3] 2628; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 2629; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] 2630; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] 2631; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] 2632; SSE-NEXT: packuswb %xmm1, %xmm10 2633; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1] 2634; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 2635; SSE-NEXT: movdqa %xmm14, %xmm1 2636; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] 2637; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 2638; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3] 2639; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2] 2640; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload 2641; SSE-NEXT: pand %xmm5, %xmm0 2642; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2643; SSE-NEXT: por %xmm0, %xmm5 2644; SSE-NEXT: movdqa %xmm5, %xmm1 2645; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] 2646; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] 2647; SSE-NEXT: pand %xmm3, %xmm5 2648; SSE-NEXT: pandn %xmm1, %xmm3 2649; SSE-NEXT: por %xmm5, %xmm3 2650; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7] 2651; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] 2652; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 2653; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 2654; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 2655; SSE-NEXT: packuswb %xmm1, %xmm1 2656; SSE-NEXT: pand %xmm4, %xmm1 2657; SSE-NEXT: movdqa %xmm4, 
%xmm7 2658; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2,3,1] 2659; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] 2660; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 2661; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2662; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7] 2663; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7] 2664; SSE-NEXT: packuswb %xmm3, %xmm4 2665; SSE-NEXT: pandn %xmm4, %xmm7 2666; SSE-NEXT: por %xmm7, %xmm1 2667; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 2668; SSE-NEXT: # xmm4 = mem[3,1,2,3] 2669; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] 2670; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] 2671; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] 2672; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 2673; SSE-NEXT: packuswb %xmm3, %xmm5 2674; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1] 2675; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2676; SSE-NEXT: movaps %xmm0, 16(%rsi) 2677; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2678; SSE-NEXT: movaps %xmm0, (%rsi) 2679; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2680; SSE-NEXT: movaps %xmm0, 16(%rdx) 2681; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2682; SSE-NEXT: movaps %xmm0, (%rdx) 2683; SSE-NEXT: movdqa %xmm12, 16(%rcx) 2684; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2685; SSE-NEXT: movaps %xmm0, (%rcx) 2686; SSE-NEXT: movdqa %xmm9, 16(%r8) 2687; SSE-NEXT: movdqa %xmm8, (%r8) 2688; SSE-NEXT: movaps %xmm1, 16(%r9) 2689; SSE-NEXT: movaps %xmm2, (%r9) 2690; SSE-NEXT: addq $184, %rsp 2691; SSE-NEXT: retq 2692; 2693; AVX-LABEL: load_i8_stride5_vf32: 2694; AVX: # %bb.0: 2695; AVX-NEXT: vmovdqa 144(%rdi), %xmm0 2696; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] 2697; AVX-NEXT: vmovdqa 128(%rdi), %xmm1 2698; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 2699; AVX-NEXT: vpor %xmm2, %xmm3, %xmm5 2700; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 2701; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] 2702; AVX-NEXT: vmovdqa 96(%rdi), %xmm3 2703; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u] 2704; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 2705; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6 2706; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7] 2707; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 2708; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] 2709; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 2710; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] 2711; AVX-NEXT: vmovdqa (%rdi), %xmm7 2712; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 2713; AVX-NEXT: vmovdqa 32(%rdi), %xmm8 2714; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 2715; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] 2716; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] 2717; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] 2718; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] 2719; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] 2720; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 2721; AVX-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 2722; AVX-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 
2723; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 2724; AVX-NEXT: vandps %ymm5, %ymm12, %ymm11 2725; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 2726; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] 2727; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 2728; AVX-NEXT: vorps %ymm14, %ymm11, %ymm11 2729; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 2730; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2731; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] 2732; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 2733; AVX-NEXT: vpor %xmm11, %xmm14, %xmm11 2734; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] 2735; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] 2736; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2737; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] 2738; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] 2739; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6 2740; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 2741; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] 2742; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] 2743; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 2744; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] 2745; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14 2746; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] 2747; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 2748; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm11[5,6,7] 2749; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 2750; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] 2751; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 2752; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 2753; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 2754; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2755; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] 2756; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm14 2757; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 2758; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4,5,6,7] 2759; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] 2760; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15 2761; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm14 2762; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5,6,7] 2763; AVX-NEXT: vpor %xmm6, %xmm14, %xmm6 2764; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 2765; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] 2766; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12 2767; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6 2768; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] 2769; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 2770; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 2771; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] 2772; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] 2773; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2774; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm13 2775; AVX-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] 2776; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 2777; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4],xmm12[5,6,7] 2778; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm12 2779; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] 2780; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm13 2781; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 2782; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3,4,5,6,7] 2783; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u] 2784; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm14 2785; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm13 2786; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] 2787; AVX-NEXT: vpor %xmm6, %xmm13, %xmm6 2788; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] 2789; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] 2790; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 2791; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] 2792; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14 2793; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] 2794; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 2795; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] 2796; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 2797; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] 2798; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 2799; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14 2800; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 2801; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] 2802; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 2803; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2804; AVX-NEXT: vextractf128 $1, %ymm6, %xmm15 2805; AVX-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] 2806; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm14, %xmm14 2807; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm6, %ymm6 2808; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] 2809; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10 2810; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9 2811; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] 2812; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] 2813; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8 2814; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7 2815; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] 2816; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 2817; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] 2818; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] 2819; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 2820; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2 2821; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] 2822; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 2823; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero 2824; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] 2825; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2826; AVX-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 2827; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] 2828; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 
2829; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] 2830; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2831; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2832; AVX-NEXT: vmovaps %ymm1, (%rsi) 2833; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2834; AVX-NEXT: vmovaps %ymm1, (%rdx) 2835; AVX-NEXT: vmovaps %ymm12, (%rcx) 2836; AVX-NEXT: vmovaps %ymm6, (%r8) 2837; AVX-NEXT: vmovaps %ymm0, (%r9) 2838; AVX-NEXT: vzeroupper 2839; AVX-NEXT: retq 2840; 2841; AVX2-LABEL: load_i8_stride5_vf32: 2842; AVX2: # %bb.0: 2843; AVX2-NEXT: vmovdqa (%rdi), %ymm3 2844; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 2845; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 2846; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 2847; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 2848; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 2849; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 2850; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 2851; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 2852; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 2853; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 2854; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 2855; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] 2856; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 2857; AVX2-NEXT: # ymm8 = mem[0,1,0,1] 2858; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 2859; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 2860; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] 2861; AVX2-NEXT: vmovdqa %xmm8, %xmm7 2862; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 2863; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 2864; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 2865; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 2866; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 2867; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 2868; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 2869; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 2870; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 2871; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] 2872; AVX2-NEXT: # ymm12 = mem[0,1,0,1] 2873; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 2874; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 2875; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5 2876; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 2877; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 2878; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 2879; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 2880; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 2881; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 2882; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 2883; AVX2-NEXT: vpermq {{.*#+}} ymm12 = 
ymm11[2,3,0,1] 2884; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] 2885; AVX2-NEXT: # ymm13 = mem[0,1,0,1] 2886; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 2887; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 2888; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 2889; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 2890; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 2891; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 2892; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] 2893; AVX2-NEXT: # ymm13 = mem[0,1,0,1] 2894; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 2895; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10 2896; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 2897; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm10 2898; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 2899; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10 2900; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 2901; AVX2-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10 2902; AVX2-NEXT: vmovdqa 144(%rdi), %xmm8 2903; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] 2904; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 2905; AVX2-NEXT: vmovdqa 128(%rdi), %xmm3 2906; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 2907; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4 2908; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 2909; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] 2910; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 2911; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 2912; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 2913; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] 2914; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 2915; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 2916; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] 2917; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 2918; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 2919; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2920; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 2921; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 2922; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm6 2923; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] 2924; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u] 2925; AVX2-NEXT: vpor %xmm6, %xmm9, %xmm6 2926; AVX2-NEXT: vmovdqa 128(%rdi), %ymm9 2927; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 2928; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] 2929; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 2930; AVX2-NEXT: vpbroadcastq 
{{.*#+}} ymm9 = [0,5,0,5,0,5,0,5] 2931; AVX2-NEXT: vpermd %ymm6, %ymm9, %ymm6 2932; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0 2933; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] 2934; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 2935; AVX2-NEXT: vpor %xmm2, %xmm6, %xmm2 2936; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 2937; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] 2938; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 2939; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] 2940; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 2941; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 2942; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 2943; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] 2944; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] 2945; AVX2-NEXT: vmovdqa %ymm4, (%rsi) 2946; AVX2-NEXT: vmovdqa %ymm2, (%rdx) 2947; AVX2-NEXT: vmovdqa %ymm3, (%rcx) 2948; AVX2-NEXT: vmovdqa %ymm1, (%r8) 2949; AVX2-NEXT: vmovdqa %ymm0, (%r9) 2950; AVX2-NEXT: vzeroupper 2951; AVX2-NEXT: retq 2952; 2953; AVX2-FP-LABEL: load_i8_stride5_vf32: 2954; AVX2-FP: # %bb.0: 2955; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 2956; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 2957; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 2958; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 2959; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 2960; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 2961; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 2962; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 2963; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 2964; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 2965; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 2966; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 2967; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] 2968; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 2969; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] 2970; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 2971; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 2972; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] 2973; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm7 2974; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 2975; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 2976; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 2977; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 2978; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 2979; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 2980; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 2981; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 2982; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 2983; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = 
[0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] 2984; AVX2-FP-NEXT: # ymm12 = mem[0,1,0,1] 2985; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 2986; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 2987; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5 2988; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 2989; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 2990; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 2991; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 2992; AVX2-FP-NEXT: vpor %xmm10, %xmm9, %xmm9 2993; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 2994; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 2995; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 2996; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] 2997; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] 2998; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 2999; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 3000; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 3001; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 3002; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 3003; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3004; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] 3005; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] 3006; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 3007; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10 3008; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 3009; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm10 3010; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 3011; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10 3012; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3013; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10 3014; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm8 3015; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] 3016; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 3017; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm3 3018; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3019; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4 3020; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3021; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] 3022; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 3023; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 3024; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 3025; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] 3026; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] 3027; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 3028; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] 3029; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3030; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 3031; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 3032; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 3033; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 3034; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm6 3035; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] 3036; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u] 3037; AVX2-FP-NEXT: vpor %xmm6, %xmm9, %xmm6 3038; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm9 3039; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3040; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] 3041; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3042; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5] 3043; AVX2-FP-NEXT: vpermd %ymm6, %ymm9, %ymm6 3044; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0 3045; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] 3046; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3047; AVX2-FP-NEXT: vpor %xmm2, %xmm6, %xmm2 3048; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 3049; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] 3050; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 3051; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] 3052; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3053; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 3054; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 3055; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] 3056; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] 3057; AVX2-FP-NEXT: vmovdqa %ymm4, (%rsi) 3058; AVX2-FP-NEXT: vmovdqa %ymm2, (%rdx) 3059; AVX2-FP-NEXT: vmovdqa %ymm3, (%rcx) 3060; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8) 3061; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) 3062; AVX2-FP-NEXT: vzeroupper 3063; AVX2-FP-NEXT: retq 3064; 3065; AVX2-FCP-LABEL: load_i8_stride5_vf32: 3066; AVX2-FCP: # %bb.0: 3067; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 3068; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 3069; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 3070; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 3071; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 3072; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 3073; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 3074; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 3075; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 3076; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 3077; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 3078; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 3079; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] 3080; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 3081; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 3082; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 3083; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 3084; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] 3085; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm7 3086; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 3087; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 3088; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] 3089; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 3090; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] 3091; AVX2-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 3092; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 3093; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 3094; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 3095; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] 3096; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1] 3097; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 3098; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 3099; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5 3100; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 3101; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 3102; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 3103; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 3104; AVX2-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 3105; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 3106; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 3107; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3108; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] 3109; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] 3110; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 3111; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 3112; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 3113; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 3114; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 3115; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3116; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] 3117; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] 3118; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 3119; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10 3120; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 3121; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 3122; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 3123; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 3124; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3125; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10 3126; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm8 3127; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] 3128; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 3129; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm3 3130; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3131; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4 3132; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3133; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] 3134; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 3135; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 3136; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 3137; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] 3138; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] 3139; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 3140; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] 3141; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3142; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 3143; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 3144; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 3145; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 3146; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm6 3147; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] 3148; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u] 3149; AVX2-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 3150; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 3151; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3152; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] 3153; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3154; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5] 3155; AVX2-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6 3156; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0 3157; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] 3158; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3159; AVX2-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 3160; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 3161; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] 3162; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 3163; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] 3164; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3165; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 3166; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 3167; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] 3168; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] 3169; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rsi) 3170; AVX2-FCP-NEXT: vmovdqa %ymm2, 
(%rdx) 3171; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rcx) 3172; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) 3173; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) 3174; AVX2-FCP-NEXT: vzeroupper 3175; AVX2-FCP-NEXT: retq 3176; 3177; AVX512-LABEL: load_i8_stride5_vf32: 3178; AVX512: # %bb.0: 3179; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 3180; AVX512-NEXT: vmovdqa (%rdi), %ymm3 3181; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 3182; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 3183; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1 3184; AVX512-NEXT: vmovdqa %ymm2, %ymm4 3185; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3186; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] 3187; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) 3188; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 3189; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6 3190; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 3191; AVX512-NEXT: vmovdqa %ymm4, %ymm7 3192; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5)) 3193; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 3194; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 3195; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] 3196; AVX512-NEXT: vpor %xmm7, %xmm9, %xmm9 3197; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 3198; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6 3199; AVX512-NEXT: vmovdqa 144(%rdi), %xmm7 3200; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm6 3201; AVX512-NEXT: vmovdqa 128(%rdi), %xmm8 3202; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3203; AVX512-NEXT: vpor %xmm6, %xmm10, %xmm6 3204; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3205; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] 3206; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] 3207; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 3208; AVX512-NEXT: vmovdqa %ymm10, %ymm9 3209; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0)) 3210; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] 3211; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9)) 3212; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 3213; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12 3214; AVX512-NEXT: vmovdqa %ymm2, %ymm13 3215; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5)) 3216; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 3217; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13 3218; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 3219; AVX512-NEXT: vpor %xmm14, %xmm13, %xmm13 3220; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12 3221; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm9 3222; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3223; AVX512-NEXT: vpor %xmm9, %xmm12, %xmm9 
3224; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 3225; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] 3226; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 3227; AVX512-NEXT: vmovdqa %ymm4, %ymm12 3228; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) 3229; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] 3230; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) 3231; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 3232; AVX512-NEXT: vpshufb %ymm12, %ymm13, %ymm13 3233; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) 3234; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm14 3235; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] 3236; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] 3237; AVX512-NEXT: vpor %xmm14, %xmm10, %xmm10 3238; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 3239; AVX512-NEXT: vpshufb %xmm12, %xmm7, %xmm11 3240; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3241; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 3242; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 3243; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] 3244; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3245; AVX512-NEXT: vmovdqa %ymm2, %ymm11 3246; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0)) 3247; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3248; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) 3249; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3250; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm12 3251; AVX512-NEXT: vmovdqa %ymm4, %ymm13 3252; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) 3253; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] 3254; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13 3255; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] 3256; AVX512-NEXT: vpor %xmm14, %xmm13, %xmm13 3257; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12 3258; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7 3259; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3260; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 3261; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3262; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 3263; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) 3264; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) 3265; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 3266; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3267; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3268; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 3269; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3270; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] 3271; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) 3272; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3273; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3274; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 3275; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3276; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3277; AVX512-NEXT: vpermd %ymm1, %ymm2, %ymm1 3278; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) 3279; AVX512-NEXT: vmovdqa %ymm6, (%rsi) 3280; AVX512-NEXT: vmovdqa %ymm9, (%rdx) 3281; AVX512-NEXT: vmovdqa %ymm10, (%rcx) 3282; AVX512-NEXT: vmovdqa %ymm7, (%r8) 3283; AVX512-NEXT: vmovdqa %ymm1, (%r9) 3284; AVX512-NEXT: vzeroupper 3285; AVX512-NEXT: retq 3286; 3287; AVX512-FCP-LABEL: load_i8_stride5_vf32: 3288; AVX512-FCP: # %bb.0: 3289; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 3290; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 3291; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 3292; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 3293; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 3294; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm4 3295; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3296; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] 3297; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) 3298; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 3299; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 3300; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 3301; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7 3302; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5)) 3303; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 3304; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 3305; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] 3306; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9 3307; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 3308; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6 3309; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm7 3310; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm6 3311; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 3312; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3313; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 3314; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3315; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] 3316; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] 3317; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 3318; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm9 3319; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0)) 3320; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] 3321; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9)) 3322; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 3323; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12 3324; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm13 3325; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5)) 3326; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 3327; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 3328; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 3329; AVX512-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 3330; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12 3331; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9 3332; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3333; AVX512-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 3334; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 3335; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] 3336; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 3337; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm12 3338; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) 3339; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] 3340; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) 3341; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 3342; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm13 3343; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) 3344; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 3345; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] 3346; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] 3347; AVX512-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10 3348; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 3349; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm11 3350; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3351; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 3352; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 3353; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] 3354; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3355; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm11 3356; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0)) 3357; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3358; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) 3359; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3360; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 3361; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm13 3362; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) 3363; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] 3364; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 3365; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] 3366; AVX512-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 
3367; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12 3368; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 3369; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3370; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3371; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3372; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 3373; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) 3374; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) 3375; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 3376; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3377; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3378; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 3379; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3380; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] 3381; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) 3382; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3383; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3384; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 3385; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3386; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3387; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 3388; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) 3389; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi) 3390; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rdx) 3391; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rcx) 3392; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8) 3393; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9) 3394; AVX512-FCP-NEXT: vzeroupper 3395; AVX512-FCP-NEXT: retq 3396; 3397; AVX512DQ-LABEL: load_i8_stride5_vf32: 3398; AVX512DQ: # %bb.0: 3399; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 3400; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 3401; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 3402; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 3403; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1 3404; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm4 3405; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3406; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] 3407; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) 3408; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 3409; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6 3410; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 3411; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7 3412; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5)) 3413; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm9 3414; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 3415; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] 3416; AVX512DQ-NEXT: vpor %xmm7, %xmm9, %xmm9 3417; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 3418; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6 3419; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm7 3420; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm6 3421; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm8 3422; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3423; AVX512DQ-NEXT: vpor %xmm6, %xmm10, %xmm6 3424; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3425; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] 3426; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] 3427; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 3428; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm9 3429; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0)) 3430; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] 3431; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9)) 3432; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 3433; AVX512DQ-NEXT: vpshufb %ymm9, %ymm12, %ymm12 3434; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm13 3435; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5)) 3436; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 3437; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13 3438; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 3439; AVX512DQ-NEXT: vpor %xmm14, %xmm13, %xmm13 3440; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12 3441; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm9 3442; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3443; AVX512DQ-NEXT: vpor %xmm9, %xmm12, %xmm9 3444; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 3445; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] 3446; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 3447; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm12 3448; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) 3449; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] 3450; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) 3451; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 3452; AVX512DQ-NEXT: vpshufb %ymm12, %ymm13, %ymm13 3453; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) 3454; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm14 3455; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] 3456; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] 3457; AVX512DQ-NEXT: vpor %xmm14, %xmm10, %xmm10 3458; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 3459; AVX512DQ-NEXT: vpshufb %xmm12, %xmm7, %xmm11 3460; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3461; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 3462; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 3463; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] 3464; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3465; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11 3466; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0)) 3467; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3468; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) 3469; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3470; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm12 3471; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm13 3472; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) 3473; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] 3474; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13 3475; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] 3476; AVX512DQ-NEXT: vpor %xmm14, %xmm13, %xmm13 3477; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12 3478; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7 3479; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3480; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 3481; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3482; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 3483; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) 3484; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) 3485; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 3486; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3487; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3488; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 3489; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3490; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] 3491; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) 3492; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3493; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3494; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 3495; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3496; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3497; AVX512DQ-NEXT: vpermd %ymm1, %ymm2, %ymm1 3498; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) 3499; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi) 3500; AVX512DQ-NEXT: vmovdqa %ymm9, (%rdx) 3501; AVX512DQ-NEXT: vmovdqa %ymm10, (%rcx) 3502; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8) 3503; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9) 3504; AVX512DQ-NEXT: vzeroupper 3505; AVX512DQ-NEXT: retq 3506; 3507; AVX512DQ-FCP-LABEL: load_i8_stride5_vf32: 3508; AVX512DQ-FCP: # %bb.0: 3509; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 3510; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 3511; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 3512; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 3513; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 3514; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm4 3515; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 
3516; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] 3517; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) 3518; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 3519; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 3520; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 3521; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7 3522; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5)) 3523; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 3524; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 3525; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] 3526; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9 3527; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 3528; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6 3529; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm7 3530; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm6 3531; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 3532; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3533; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 3534; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3535; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] 3536; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] 3537; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 3538; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm9 3539; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0)) 3540; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] 3541; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9)) 3542; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 3543; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12 3544; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm13 3545; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5)) 3546; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 3547; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 3548; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 3549; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 3550; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12 3551; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9 3552; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3553; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 3554; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 3555; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] 3556; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 3557; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm12 3558; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) 3559; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = 
ymm12[2,3,0,1] 3560; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) 3561; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 3562; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm13 3563; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) 3564; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 3565; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] 3566; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] 3567; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10 3568; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 3569; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm11 3570; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3571; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 3572; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 3573; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] 3574; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] 3575; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11 3576; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0)) 3577; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3578; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) 3579; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3580; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 3581; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm13 3582; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) 3583; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] 3584; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 3585; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] 3586; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 3587; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12 3588; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 3589; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3590; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3591; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3592; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 3593; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) 3594; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) 3595; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 3596; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3597; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3598; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 3599; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 3600; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] 3601; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) 3602; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3603; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3604; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 3605; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3606; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3607; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 3608; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) 3609; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi) 3610; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rdx) 3611; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rcx) 3612; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8) 3613; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9) 3614; AVX512DQ-FCP-NEXT: vzeroupper 3615; AVX512DQ-FCP-NEXT: retq 3616; 3617; AVX512BW-LABEL: load_i8_stride5_vf32: 3618; AVX512BW: # %bb.0: 3619; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 3620; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 3621; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 3622; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm1 3623; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294 3624; AVX512BW-NEXT: kmovd %eax, %k1 3625; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} 3626; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 3627; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 3628; AVX512BW-NEXT: kmovd %eax, %k2 3629; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} 3630; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52 3631; AVX512BW-NEXT: kmovd %eax, %k2 3632; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} 3633; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 3634; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 3635; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 3636; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 3637; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 3638; AVX512BW-NEXT: kmovd %eax, %k3 3639; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 3640; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm6 3641; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11] 3642; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm7 3643; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3644; AVX512BW-NEXT: vpor %xmm4, %xmm8, %xmm4 3645; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3646; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] 3647; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 3648; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A 3649; AVX512BW-NEXT: kmovd %eax, %k4 3650; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4} 3651; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] 3652; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 3653; AVX512BW-NEXT: kmovd %eax, %k5 3654; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5} 3655; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1} 3656; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] 3657; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 3658; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] 3659; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 3660; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 3661; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12] 3662; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3663; AVX512BW-NEXT: vpor %xmm5, %xmm9, %xmm5 3664; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 3665; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] 3666; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] 3667; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} 3668; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] 3669; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 3670; AVX512BW-NEXT: kmovd %eax, %k5 3671; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} 3672; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} 3673; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 3674; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 3675; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 3676; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 3677; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 3678; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] 3679; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3680; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 3681; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3682; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 3683; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 3684; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1} 3685; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 3686; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 3687; AVX512BW-NEXT: kmovd %eax, %k3 3688; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3} 3689; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} 3690; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 3691; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10 3692; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 3693; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 3694; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 3695; AVX512BW-NEXT: kmovd %eax, %k3 3696; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3697; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] 3698; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3699; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 3700; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3701; AVX512BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000 3702; AVX512BW-NEXT: kmovd %eax, %k3 3703; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} 3704; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} 3705; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 3706; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3707; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3708; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 3709; AVX512BW-NEXT: 
vmovdqu16 %ymm1, %ymm0 {%k2} 3710; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 3711; AVX512BW-NEXT: movl $554172416, %eax # imm = 0x21080000 3712; AVX512BW-NEXT: kmovd %eax, %k1 3713; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 3714; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3715; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3716; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 3717; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3718; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3719; AVX512BW-NEXT: vpermd %ymm1, %ymm2, %ymm1 3720; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} 3721; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) 3722; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) 3723; AVX512BW-NEXT: vmovdqa %ymm8, (%rcx) 3724; AVX512BW-NEXT: vmovdqa %ymm10, (%r8) 3725; AVX512BW-NEXT: vmovdqa %ymm0, (%r9) 3726; AVX512BW-NEXT: vzeroupper 3727; AVX512BW-NEXT: retq 3728; 3729; AVX512BW-FCP-LABEL: load_i8_stride5_vf32: 3730; AVX512BW-FCP: # %bb.0: 3731; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 3732; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 3733; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 3734; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 3735; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 3736; AVX512BW-FCP-NEXT: kmovd %eax, %k1 3737; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} 3738; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 3739; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 3740; AVX512BW-FCP-NEXT: kmovd %eax, %k2 3741; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} 3742; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 3743; AVX512BW-FCP-NEXT: kmovd %eax, %k2 3744; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} 3745; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 3746; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 3747; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 3748; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 3749; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 3750; AVX512BW-FCP-NEXT: kmovd %eax, %k3 3751; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 3752; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm6 3753; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11] 3754; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm7 3755; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3756; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4 3757; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3758; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] 3759; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 3760; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A 3761; AVX512BW-FCP-NEXT: kmovd %eax, %k4 3762; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4} 3763; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] 3764; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 3765; AVX512BW-FCP-NEXT: kmovd %eax, %k5 3766; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5} 3767; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1} 3768; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] 3769; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 3770; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] 3771; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 3772; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 3773; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12] 3774; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3775; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 3776; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 3777; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] 3778; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] 3779; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} 3780; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] 3781; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 3782; AVX512BW-FCP-NEXT: kmovd %eax, %k5 3783; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} 3784; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} 3785; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 3786; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 3787; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 3788; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 3789; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 3790; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] 3791; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3792; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 3793; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3794; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 3795; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 3796; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1} 3797; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 3798; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 3799; AVX512BW-FCP-NEXT: kmovd %eax, %k3 3800; AVX512BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3} 3801; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} 3802; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 3803; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 3804; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 3805; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3806; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 3807; AVX512BW-FCP-NEXT: kmovd %eax, %k3 3808; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3809; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] 3810; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3811; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 3812; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3813; AVX512BW-FCP-NEXT: movl $-33554432, 
%eax # imm = 0xFE000000 3814; AVX512BW-FCP-NEXT: kmovd %eax, %k3 3815; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} 3816; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} 3817; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 3818; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3819; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3820; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 3821; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} 3822; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 3823; AVX512BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000 3824; AVX512BW-FCP-NEXT: kmovd %eax, %k1 3825; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 3826; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3827; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3828; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 3829; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3830; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3831; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 3832; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} 3833; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) 3834; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) 3835; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rcx) 3836; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r8) 3837; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9) 3838; AVX512BW-FCP-NEXT: vzeroupper 3839; AVX512BW-FCP-NEXT: retq 3840; 3841; AVX512DQ-BW-LABEL: load_i8_stride5_vf32: 3842; AVX512DQ-BW: # %bb.0: 3843; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 3844; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 3845; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0 3846; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %ymm1 3847; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294 3848; AVX512DQ-BW-NEXT: kmovd %eax, %k1 3849; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} 3850; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 3851; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 3852; AVX512DQ-BW-NEXT: kmovd %eax, %k2 3853; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} 3854; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52 3855; AVX512DQ-BW-NEXT: kmovd %eax, %k2 3856; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} 3857; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 3858; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 3859; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 3860; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 3861; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 3862; AVX512DQ-BW-NEXT: kmovd %eax, %k3 3863; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 3864; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm6 3865; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11] 3866; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm7 3867; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3868; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm8, %xmm4 3869; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3870; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] 3871; 
AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 3872; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A 3873; AVX512DQ-BW-NEXT: kmovd %eax, %k4 3874; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4} 3875; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] 3876; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 3877; AVX512DQ-BW-NEXT: kmovd %eax, %k5 3878; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5} 3879; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1} 3880; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] 3881; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8 3882; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] 3883; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 3884; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 3885; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12] 3886; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3887; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm9, %xmm5 3888; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 3889; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] 3890; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] 3891; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} 3892; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] 3893; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000 3894; AVX512DQ-BW-NEXT: kmovd %eax, %k5 3895; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} 3896; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} 3897; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm10 3898; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 3899; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 3900; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm9, %xmm9 3901; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 3902; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] 3903; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 3904; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 3905; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3906; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 3907; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 3908; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1} 3909; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 3910; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000 3911; AVX512DQ-BW-NEXT: kmovd %eax, %k3 3912; AVX512DQ-BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3} 3913; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} 3914; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 3915; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm10 3916; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 3917; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 3918; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 3919; 
AVX512DQ-BW-NEXT: kmovd %eax, %k3 3920; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 3921; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] 3922; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 3923; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 3924; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3925; AVX512DQ-BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000 3926; AVX512DQ-BW-NEXT: kmovd %eax, %k3 3927; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} 3928; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} 3929; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 3930; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 3931; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 3932; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 3933; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} 3934; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 3935; AVX512DQ-BW-NEXT: movl $554172416, %eax # imm = 0x21080000 3936; AVX512DQ-BW-NEXT: kmovd %eax, %k1 3937; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 3938; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 3939; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3940; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 3941; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 3942; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 3943; AVX512DQ-BW-NEXT: vpermd %ymm1, %ymm2, %ymm1 3944; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} 3945; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) 3946; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) 3947; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rcx) 3948; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r8) 3949; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%r9) 3950; AVX512DQ-BW-NEXT: vzeroupper 3951; AVX512DQ-BW-NEXT: retq 3952; 3953; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf32: 3954; AVX512DQ-BW-FCP: # %bb.0: 3955; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 3956; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 3957; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 3958; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 3959; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 3960; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 3961; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} 3962; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 3963; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 3964; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 3965; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} 3966; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 3967; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 3968; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} 3969; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 3970; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 3971; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 3972; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 3973; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 3974; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 3975; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 3976; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm6 3977; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11] 3978; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm7 3979; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 3980; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4 3981; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3982; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] 3983; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 3984; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A 3985; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 3986; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4} 3987; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] 3988; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 3989; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 3990; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5} 3991; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1} 3992; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] 3993; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 3994; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] 3995; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 3996; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 3997; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12] 3998; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 3999; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 4000; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 4001; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] 4002; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] 4003; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} 4004; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] 4005; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 4006; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 4007; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} 4008; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} 4009; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 4010; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] 4011; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] 4012; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 4013; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 4014; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] 4015; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 4016; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 4017; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 4018; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 4019; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm9[0,1,2,3],ymm8[4,5,6,7] 4020; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1} 4021; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 4022; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 4023; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 4024; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3} 4025; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} 4026; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] 4027; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 4028; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] 4029; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 4030; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 4031; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 4032; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 4033; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] 4034; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 4035; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 4036; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4037; AVX512DQ-BW-FCP-NEXT: movl $-33554432, %eax # imm = 0xFE000000 4038; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 4039; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} 4040; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} 4041; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 4042; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 4043; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 4044; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 4045; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} 4046; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 4047; AVX512DQ-BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000 4048; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 4049; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 4050; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 4051; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 4052; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 4053; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 4054; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] 4055; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 4056; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} 4057; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) 4058; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) 4059; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rcx) 4060; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r8) 4061; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9) 4062; AVX512DQ-BW-FCP-NEXT: vzeroupper 4063; AVX512DQ-BW-FCP-NEXT: retq 4064 %wide.vec = load <160 x i8>, ptr %in.vec, align 64 4065 %strided.vec0 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155> 4066 %strided.vec1 = shufflevector <160 x 
i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156> 4067 %strided.vec2 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157> 4068 %strided.vec3 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158> 4069 %strided.vec4 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159> 4070 store <32 x i8> %strided.vec0, ptr %out.vec0, align 64 4071 store <32 x i8> %strided.vec1, ptr %out.vec1, align 64 4072 store <32 x i8> %strided.vec2, ptr %out.vec2, align 64 4073 store <32 x i8> %strided.vec3, ptr %out.vec3, align 64 4074 store <32 x i8> %strided.vec4, ptr %out.vec4, align 64 4075 ret void 4076} 4077 4078define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { 4079; SSE-LABEL: load_i8_stride5_vf64: 4080; SSE: # %bb.0: 4081; SSE-NEXT: subq $552, %rsp # imm = 0x228 4082; SSE-NEXT: movdqa 160(%rdi), %xmm9 4083; SSE-NEXT: movdqa 176(%rdi), %xmm3 4084; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4085; SSE-NEXT: movdqa 208(%rdi), %xmm4 4086; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4087; SSE-NEXT: movdqa 192(%rdi), %xmm1 4088; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4089; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 4090; SSE-NEXT: movdqa %xmm2, %xmm0 4091; SSE-NEXT: pandn %xmm1, %xmm0 4092; SSE-NEXT: movdqa %xmm4, %xmm1 4093; SSE-NEXT: pand %xmm2, %xmm1 4094; SSE-NEXT: movdqa %xmm2, %xmm14 4095; SSE-NEXT: por %xmm0, %xmm1 4096; SSE-NEXT: pxor %xmm12, %xmm12 4097; SSE-NEXT: movdqa %xmm1, %xmm0 4098; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 4099; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 4100; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 4101; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] 4102; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4103; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] 4104; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 4105; SSE-NEXT: packuswb %xmm1, %xmm0 4106; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] 4107; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] 
4108; SSE-NEXT: movdqa %xmm11, %xmm1 4109; SSE-NEXT: pandn %xmm0, %xmm1 4110; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 4111; SSE-NEXT: movdqa %xmm10, %xmm0 4112; SSE-NEXT: pandn %xmm3, %xmm0 4113; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 4114; SSE-NEXT: movdqa %xmm2, %xmm3 4115; SSE-NEXT: movdqa %xmm2, %xmm4 4116; SSE-NEXT: pandn %xmm9, %xmm3 4117; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4118; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 4119; SSE-NEXT: movdqa %xmm7, %xmm3 4120; SSE-NEXT: pandn %xmm9, %xmm3 4121; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4122; SSE-NEXT: movdqa %xmm14, %xmm2 4123; SSE-NEXT: pandn %xmm9, %xmm2 4124; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4125; SSE-NEXT: movdqa %xmm10, %xmm2 4126; SSE-NEXT: pandn %xmm9, %xmm2 4127; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4128; SSE-NEXT: pand %xmm10, %xmm9 4129; SSE-NEXT: por %xmm0, %xmm9 4130; SSE-NEXT: movdqa %xmm9, %xmm0 4131; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 4132; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] 4133; SSE-NEXT: movdqa %xmm8, %xmm2 4134; SSE-NEXT: pandn %xmm0, %xmm2 4135; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] 4136; SSE-NEXT: pand %xmm8, %xmm9 4137; SSE-NEXT: por %xmm2, %xmm9 4138; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7] 4139; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] 4140; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 4141; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] 4142; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] 4143; SSE-NEXT: packuswb %xmm0, %xmm0 4144; SSE-NEXT: pand %xmm11, %xmm0 4145; SSE-NEXT: por %xmm1, %xmm0 4146; SSE-NEXT: movdqa 224(%rdi), %xmm3 4147; SSE-NEXT: movdqa %xmm3, %xmm2 4148; SSE-NEXT: pxor %xmm1, %xmm1 4149; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4150; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4151; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 4152; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4153; SSE-NEXT: pxor %xmm9, %xmm9 4154; SSE-NEXT: movdqa %xmm3, %xmm1 4155; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] 4156; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] 4157; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 4158; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 4159; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] 4160; SSE-NEXT: packuswb %xmm1, %xmm1 4161; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 4162; SSE-NEXT: movdqa %xmm6, %xmm2 4163; SSE-NEXT: pandn %xmm1, %xmm2 4164; SSE-NEXT: pand %xmm6, %xmm0 4165; SSE-NEXT: por %xmm0, %xmm2 4166; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4167; SSE-NEXT: movdqa 32(%rdi), %xmm1 4168; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 4169; SSE-NEXT: movdqa %xmm14, %xmm0 4170; SSE-NEXT: pandn %xmm1, %xmm0 4171; SSE-NEXT: movdqa 48(%rdi), %xmm15 4172; SSE-NEXT: movdqa %xmm15, %xmm1 4173; SSE-NEXT: pand %xmm14, %xmm1 4174; SSE-NEXT: por %xmm0, %xmm1 4175; SSE-NEXT: movdqa %xmm1, %xmm0 4176; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4177; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 4178; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 4179; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4180; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4181; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] 4182; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 4183; SSE-NEXT: packuswb %xmm1, %xmm0 4184; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] 4185; SSE-NEXT: movdqa %xmm11, %xmm1 4186; SSE-NEXT: pandn %xmm0, %xmm1 4187; SSE-NEXT: movdqa 16(%rdi), %xmm0 4188; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4189; SSE-NEXT: movdqa %xmm10, %xmm2 4190; SSE-NEXT: pandn %xmm0, %xmm2 4191; SSE-NEXT: movdqa (%rdi), %xmm3 4192; SSE-NEXT: movdqa %xmm4, %xmm0 4193; SSE-NEXT: pandn %xmm3, %xmm4 4194; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4195; SSE-NEXT: movdqa %xmm7, %xmm4 4196; SSE-NEXT: pandn %xmm3, %xmm4 4197; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4198; SSE-NEXT: movdqa %xmm14, %xmm4 4199; SSE-NEXT: pandn %xmm3, %xmm4 4200; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4201; SSE-NEXT: movdqa %xmm10, %xmm4 4202; SSE-NEXT: pandn %xmm3, %xmm4 4203; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4204; SSE-NEXT: pand %xmm10, %xmm3 4205; SSE-NEXT: por %xmm2, %xmm3 4206; SSE-NEXT: movdqa %xmm3, %xmm2 4207; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 4208; SSE-NEXT: movdqa %xmm8, %xmm4 4209; SSE-NEXT: pandn %xmm2, %xmm4 4210; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 4211; SSE-NEXT: pand %xmm8, %xmm3 4212; SSE-NEXT: por %xmm4, %xmm3 4213; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7] 4214; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] 4215; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 4216; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] 4217; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] 4218; SSE-NEXT: packuswb %xmm2, %xmm2 4219; SSE-NEXT: pand %xmm11, %xmm2 4220; SSE-NEXT: por %xmm1, %xmm2 4221; SSE-NEXT: movdqa 64(%rdi), %xmm1 4222; SSE-NEXT: movdqa %xmm1, %xmm3 4223; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 4224; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4225; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4226; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4227; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] 4228; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,0],xmm3[2,3] 4229; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 4230; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 4231; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] 4232; SSE-NEXT: packuswb %xmm1, %xmm1 4233; SSE-NEXT: movdqa %xmm6, %xmm3 4234; SSE-NEXT: pandn %xmm1, %xmm3 4235; SSE-NEXT: pand %xmm6, %xmm2 4236; SSE-NEXT: por %xmm2, %xmm3 4237; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4238; SSE-NEXT: movdqa 272(%rdi), %xmm2 4239; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4240; SSE-NEXT: movdqa %xmm14, %xmm1 4241; SSE-NEXT: pandn %xmm2, %xmm1 4242; SSE-NEXT: movdqa 288(%rdi), %xmm13 4243; SSE-NEXT: movdqa %xmm13, %xmm2 4244; SSE-NEXT: pand %xmm14, %xmm2 4245; SSE-NEXT: por %xmm1, %xmm2 4246; SSE-NEXT: movdqa %xmm2, %xmm1 4247; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 4248; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] 4249; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] 4250; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4251; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 4252; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] 4253; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 4254; SSE-NEXT: packuswb %xmm2, %xmm1 4255; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] 4256; SSE-NEXT: movdqa %xmm11, %xmm2 4257; SSE-NEXT: pandn %xmm1, %xmm2 4258; SSE-NEXT: movdqa 256(%rdi), %xmm1 4259; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 4260; SSE-NEXT: movdqa %xmm10, %xmm4 4261; SSE-NEXT: pandn %xmm1, %xmm4 4262; SSE-NEXT: movdqa 240(%rdi), %xmm3 4263; SSE-NEXT: movdqa %xmm0, %xmm1 4264; SSE-NEXT: pandn %xmm3, %xmm1 4265; SSE-NEXT: pandn %xmm3, %xmm7 4266; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4267; SSE-NEXT: movdqa %xmm14, %xmm7 4268; SSE-NEXT: pandn %xmm3, %xmm7 4269; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4270; SSE-NEXT: movdqa %xmm10, %xmm7 4271; SSE-NEXT: pandn %xmm3, %xmm7 4272; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4273; SSE-NEXT: pand %xmm10, %xmm3 4274; SSE-NEXT: por %xmm4, %xmm3 4275; SSE-NEXT: movdqa %xmm3, %xmm4 4276; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4277; SSE-NEXT: movdqa %xmm8, %xmm7 4278; SSE-NEXT: pandn %xmm4, %xmm7 4279; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 4280; SSE-NEXT: pand %xmm8, %xmm3 4281; SSE-NEXT: por %xmm7, %xmm3 4282; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] 4283; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] 4284; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] 4285; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] 4286; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] 4287; SSE-NEXT: packuswb %xmm3, %xmm3 4288; SSE-NEXT: pand %xmm11, %xmm3 4289; SSE-NEXT: por %xmm2, %xmm3 4290; SSE-NEXT: movdqa 304(%rdi), %xmm2 4291; SSE-NEXT: movdqa %xmm2, %xmm4 4292; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4293; 
SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4294; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4295; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4296; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] 4297; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3] 4298; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 4299; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 4300; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] 4301; SSE-NEXT: packuswb %xmm2, %xmm2 4302; SSE-NEXT: movdqa %xmm6, %xmm4 4303; SSE-NEXT: pandn %xmm2, %xmm4 4304; SSE-NEXT: pand %xmm6, %xmm3 4305; SSE-NEXT: por %xmm3, %xmm4 4306; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4307; SSE-NEXT: movdqa 112(%rdi), %xmm3 4308; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4309; SSE-NEXT: movdqa %xmm14, %xmm2 4310; SSE-NEXT: pandn %xmm3, %xmm2 4311; SSE-NEXT: movdqa 128(%rdi), %xmm3 4312; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4313; SSE-NEXT: pand %xmm14, %xmm3 4314; SSE-NEXT: por %xmm2, %xmm3 4315; SSE-NEXT: movdqa %xmm3, %xmm2 4316; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 4317; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3] 4318; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] 4319; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 4320; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] 4321; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] 4322; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 4323; SSE-NEXT: packuswb %xmm3, %xmm2 4324; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3] 4325; SSE-NEXT: movdqa %xmm11, %xmm3 4326; SSE-NEXT: pandn %xmm2, %xmm3 4327; SSE-NEXT: movdqa 96(%rdi), %xmm4 4328; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4329; SSE-NEXT: movdqa %xmm10, %xmm2 4330; SSE-NEXT: pandn %xmm4, %xmm2 4331; SSE-NEXT: movdqa 80(%rdi), %xmm4 4332; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4333; SSE-NEXT: pand %xmm10, %xmm4 4334; SSE-NEXT: por %xmm2, %xmm4 4335; SSE-NEXT: movdqa %xmm4, %xmm2 4336; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 4337; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 4338; SSE-NEXT: pand %xmm8, %xmm4 4339; SSE-NEXT: pandn %xmm2, %xmm8 4340; SSE-NEXT: por %xmm4, %xmm8 4341; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7] 4342; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] 4343; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 4344; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] 4345; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] 4346; SSE-NEXT: packuswb %xmm2, %xmm2 4347; SSE-NEXT: pand %xmm11, %xmm2 4348; SSE-NEXT: por %xmm3, %xmm2 4349; SSE-NEXT: movdqa 144(%rdi), %xmm12 4350; SSE-NEXT: movdqa %xmm12, %xmm4 4351; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4352; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4353; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] 4354; SSE-NEXT: movdqa %xmm12, %xmm3 4355; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4356; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] 4357; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3] 4358; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 4359; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 4360; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] 4361; SSE-NEXT: packuswb %xmm3, %xmm3 4362; SSE-NEXT: movdqa %xmm6, %xmm14 4363; SSE-NEXT: movdqa %xmm6, %xmm4 4364; SSE-NEXT: pandn %xmm3, %xmm4 4365; SSE-NEXT: pand %xmm6, %xmm2 4366; SSE-NEXT: por %xmm2, %xmm4 4367; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4368; SSE-NEXT: movdqa %xmm10, %xmm2 4369; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4370; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4371; SSE-NEXT: pand %xmm10, %xmm3 4372; SSE-NEXT: por %xmm2, %xmm3 4373; SSE-NEXT: movdqa %xmm3, %xmm2 4374; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4375; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 4376; SSE-NEXT: movdqa %xmm3, %xmm4 4377; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0] 4378; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] 4379; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] 4380; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] 4381; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] 4382; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] 4383; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 4384; SSE-NEXT: psllq $48, %xmm3 4385; SSE-NEXT: packuswb %xmm2, %xmm3 4386; SSE-NEXT: movdqa %xmm11, %xmm4 4387; SSE-NEXT: pandn %xmm3, %xmm4 4388; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4389; SSE-NEXT: movdqa %xmm6, %xmm3 4390; SSE-NEXT: pand %xmm0, %xmm3 4391; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 4392; SSE-NEXT: movdqa %xmm3, %xmm7 4393; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] 4394; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0] 4395; SSE-NEXT: movdqa %xmm2, %xmm8 4396; SSE-NEXT: pandn %xmm7, %xmm8 4397; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 4398; SSE-NEXT: pand %xmm2, %xmm3 4399; SSE-NEXT: por %xmm8, %xmm3 4400; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] 4401; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] 4402; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] 4403; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] 4404; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7] 4405; SSE-NEXT: packuswb %xmm3, %xmm3 4406; SSE-NEXT: pand %xmm11, %xmm3 4407; SSE-NEXT: por %xmm4, %xmm3 4408; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4409; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4410; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0] 4411; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] 4412; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7] 4413; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 4414; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] 4415; SSE-NEXT: packuswb %xmm4, %xmm4 4416; SSE-NEXT: movdqa %xmm14, %xmm7 4417; SSE-NEXT: pandn %xmm4, %xmm7 4418; SSE-NEXT: pand %xmm14, %xmm3 4419; SSE-NEXT: por %xmm3, %xmm7 4420; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4421; SSE-NEXT: movdqa %xmm10, %xmm3 4422; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 4423; SSE-NEXT: movdqa %xmm15, %xmm4 4424; SSE-NEXT: pand %xmm10, %xmm4 4425; SSE-NEXT: movdqa %xmm10, %xmm5 4426; SSE-NEXT: por %xmm3, %xmm4 4427; SSE-NEXT: movdqa %xmm4, %xmm3 4428; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 4429; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4430; SSE-NEXT: movdqa %xmm4, %xmm7 4431; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] 4432; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] 4433; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] 4434; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7] 4435; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1] 4436; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] 4437; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] 4438; SSE-NEXT: psllq $48, %xmm4 4439; SSE-NEXT: packuswb %xmm3, %xmm4 4440; SSE-NEXT: movdqa %xmm11, %xmm3 4441; SSE-NEXT: pandn %xmm4, %xmm3 4442; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4443; SSE-NEXT: movdqa %xmm10, %xmm4 4444; SSE-NEXT: movdqa %xmm0, %xmm8 4445; SSE-NEXT: pand %xmm0, %xmm4 4446; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4447; SSE-NEXT: movdqa %xmm4, %xmm0 4448; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] 4449; SSE-NEXT: movdqa %xmm2, %xmm7 4450; SSE-NEXT: pandn %xmm0, %xmm7 4451; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4452; SSE-NEXT: pand %xmm2, %xmm4 4453; SSE-NEXT: por %xmm7, %xmm4 4454; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] 4455; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 4456; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 4457; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 4458; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] 4459; SSE-NEXT: packuswb %xmm0, %xmm0 4460; SSE-NEXT: pand %xmm11, %xmm0 4461; SSE-NEXT: por %xmm3, %xmm0 4462; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4463; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4464; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] 4465; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] 4466; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7] 4467; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 4468; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] 4469; SSE-NEXT: packuswb %xmm3, 
%xmm3 4470; SSE-NEXT: movdqa %xmm14, %xmm4 4471; SSE-NEXT: pandn %xmm3, %xmm4 4472; SSE-NEXT: pand %xmm14, %xmm0 4473; SSE-NEXT: por %xmm0, %xmm4 4474; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4475; SSE-NEXT: movdqa %xmm5, %xmm0 4476; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4477; SSE-NEXT: movdqa %xmm13, %xmm3 4478; SSE-NEXT: pand %xmm5, %xmm3 4479; SSE-NEXT: por %xmm0, %xmm3 4480; SSE-NEXT: movdqa %xmm3, %xmm0 4481; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] 4482; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 4483; SSE-NEXT: movdqa %xmm3, %xmm4 4484; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] 4485; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] 4486; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] 4487; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] 4488; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] 4489; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 4490; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 4491; SSE-NEXT: psllq $48, %xmm3 4492; SSE-NEXT: packuswb %xmm0, %xmm3 4493; SSE-NEXT: movdqa %xmm11, %xmm0 4494; SSE-NEXT: pandn %xmm3, %xmm0 4495; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload 4496; SSE-NEXT: pand %xmm8, %xmm3 4497; SSE-NEXT: movdqa %xmm8, %xmm7 4498; SSE-NEXT: por %xmm1, %xmm3 4499; SSE-NEXT: movdqa %xmm3, %xmm1 4500; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4501; SSE-NEXT: movdqa %xmm2, %xmm4 4502; SSE-NEXT: pandn %xmm1, %xmm4 4503; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 4504; SSE-NEXT: pand %xmm2, %xmm3 4505; SSE-NEXT: por %xmm4, %xmm3 4506; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] 4507; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 4508; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 4509; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] 4510; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] 4511; SSE-NEXT: packuswb %xmm1, %xmm1 4512; SSE-NEXT: pand %xmm11, %xmm1 4513; SSE-NEXT: por %xmm0, %xmm1 4514; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4515; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4516; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] 4517; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] 4518; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7] 4519; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 4520; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 4521; SSE-NEXT: packuswb %xmm0, %xmm0 4522; SSE-NEXT: movdqa %xmm14, %xmm3 4523; SSE-NEXT: pandn %xmm0, %xmm3 4524; SSE-NEXT: pand %xmm14, %xmm1 4525; SSE-NEXT: por %xmm1, %xmm3 4526; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4527; SSE-NEXT: movdqa %xmm5, %xmm1 4528; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4529; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4530; SSE-NEXT: pand %xmm5, %xmm0 4531; SSE-NEXT: movdqa %xmm5, %xmm8 4532; SSE-NEXT: por %xmm1, %xmm0 4533; SSE-NEXT: movdqa %xmm0, %xmm1 4534; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4535; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4536; SSE-NEXT: movdqa %xmm0, %xmm3 4537; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] 4538; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] 4539; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] 4540; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7] 4541; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] 4542; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] 4543; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 4544; SSE-NEXT: psllq $48, %xmm0 4545; SSE-NEXT: packuswb %xmm1, %xmm0 4546; SSE-NEXT: movdqa %xmm7, %xmm1 4547; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4548; SSE-NEXT: pandn %xmm5, %xmm1 4549; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4550; SSE-NEXT: pand %xmm7, %xmm3 4551; SSE-NEXT: por %xmm1, %xmm3 4552; SSE-NEXT: movdqa %xmm3, %xmm1 4553; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4554; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 4555; SSE-NEXT: pand %xmm2, %xmm3 4556; SSE-NEXT: pandn %xmm1, %xmm2 4557; SSE-NEXT: por %xmm3, %xmm2 4558; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] 4559; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 4560; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 4561; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] 4562; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] 4563; SSE-NEXT: packuswb %xmm1, %xmm1 4564; SSE-NEXT: pand %xmm11, %xmm1 4565; SSE-NEXT: pandn %xmm0, %xmm11 4566; SSE-NEXT: por %xmm11, %xmm1 4567; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4568; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] 4569; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] 4570; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7] 4571; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 4572; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 4573; SSE-NEXT: packuswb %xmm0, %xmm0 4574; SSE-NEXT: movdqa %xmm14, %xmm2 4575; SSE-NEXT: pandn %xmm0, %xmm2 4576; SSE-NEXT: pand %xmm14, %xmm1 4577; SSE-NEXT: por %xmm1, %xmm2 4578; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4579; SSE-NEXT: movdqa %xmm6, %xmm1 4580; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 4581; SSE-NEXT: pand %xmm11, %xmm1 4582; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4583; SSE-NEXT: movdqa %xmm1, %xmm2 4584; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4585; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] 4586; SSE-NEXT: movdqa %xmm6, %xmm3 4587; SSE-NEXT: pandn %xmm2, %xmm3 4588; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 4589; SSE-NEXT: pand %xmm6, %xmm1 4590; SSE-NEXT: por %xmm3, %xmm1 4591; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[2,1,2,3,4,5,6,7] 4592; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4593; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 4594; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 4595; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 4596; SSE-NEXT: packuswb %xmm1, %xmm1 4597; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535] 4598; SSE-NEXT: movdqa %xmm12, %xmm2 4599; SSE-NEXT: pandn %xmm1, %xmm2 4600; SSE-NEXT: movdqa %xmm8, %xmm1 4601; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4602; SSE-NEXT: movdqa %xmm7, %xmm0 4603; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4604; SSE-NEXT: pandn %xmm4, %xmm0 4605; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4606; SSE-NEXT: movdqa %xmm11, %xmm3 4607; SSE-NEXT: pandn %xmm4, %xmm3 4608; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4609; SSE-NEXT: pand %xmm8, %xmm4 4610; SSE-NEXT: por %xmm1, %xmm4 4611; SSE-NEXT: movdqa %xmm4, %xmm1 4612; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4613; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4614; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0] 4615; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] 4616; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] 4617; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4618; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4619; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4620; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 4621; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] 4622; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5] 4623; SSE-NEXT: packuswb %xmm1, %xmm4 4624; SSE-NEXT: pand %xmm12, %xmm4 4625; SSE-NEXT: por %xmm2, %xmm4 4626; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4627; SSE-NEXT: # xmm1 = mem[1,1,1,1] 4628; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4629; SSE-NEXT: # xmm2 = mem[0,2,2,3] 4630; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4631; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 4632; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4633; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 4634; SSE-NEXT: packuswb %xmm1, %xmm1 4635; SSE-NEXT: movdqa %xmm14, %xmm3 4636; SSE-NEXT: pandn %xmm1, %xmm3 4637; SSE-NEXT: pand %xmm14, %xmm4 4638; SSE-NEXT: por %xmm4, %xmm3 4639; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4640; SSE-NEXT: pand %xmm11, %xmm10 4641; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 4642; SSE-NEXT: movdqa %xmm10, %xmm2 4643; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4644; SSE-NEXT: movdqa %xmm6, %xmm4 4645; SSE-NEXT: pandn %xmm2, %xmm4 4646; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 4647; SSE-NEXT: pand %xmm6, %xmm10 4648; SSE-NEXT: por %xmm4, %xmm10 4649; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7] 4650; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4651; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 4652; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 4653; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 4654; SSE-NEXT: packuswb %xmm1, %xmm1 4655; SSE-NEXT: movdqa %xmm12, %xmm2 4656; SSE-NEXT: pandn %xmm1, %xmm2 4657; SSE-NEXT: movdqa %xmm8, %xmm1 4658; SSE-NEXT: pandn %xmm15, %xmm1 4659; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 4660; SSE-NEXT: movdqa %xmm10, %xmm0 4661; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4662; SSE-NEXT: pandn %xmm4, %xmm0 4663; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4664; SSE-NEXT: movdqa %xmm11, %xmm3 4665; SSE-NEXT: pandn %xmm4, %xmm3 4666; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4667; SSE-NEXT: pand %xmm8, %xmm4 4668; SSE-NEXT: por %xmm1, %xmm4 4669; SSE-NEXT: movdqa %xmm4, %xmm1 4670; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4671; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4672; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0] 4673; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] 4674; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] 4675; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4676; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4677; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4678; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 4679; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] 4680; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5] 4681; SSE-NEXT: packuswb %xmm1, %xmm4 4682; SSE-NEXT: pand %xmm12, %xmm4 4683; SSE-NEXT: por %xmm2, %xmm4 4684; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4685; SSE-NEXT: # xmm1 = mem[1,1,1,1] 4686; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4687; SSE-NEXT: # xmm2 = mem[0,2,2,3] 4688; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 4689; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 4690; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4691; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 4692; SSE-NEXT: packuswb %xmm1, %xmm1 4693; SSE-NEXT: movdqa %xmm14, %xmm2 4694; SSE-NEXT: pandn %xmm1, %xmm2 4695; SSE-NEXT: pand %xmm14, %xmm4 4696; SSE-NEXT: por %xmm4, %xmm2 4697; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4698; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload 4699; SSE-NEXT: pand %xmm11, %xmm1 4700; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4701; SSE-NEXT: movdqa %xmm1, %xmm2 4702; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4703; SSE-NEXT: movdqa %xmm6, %xmm4 4704; SSE-NEXT: pandn %xmm2, %xmm4 4705; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 4706; SSE-NEXT: pand %xmm6, %xmm1 4707; SSE-NEXT: por %xmm4, %xmm1 4708; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4709; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4710; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 4711; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[0,3,2,1,4,5,6,7] 4712; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 4713; SSE-NEXT: packuswb %xmm1, %xmm1 4714; SSE-NEXT: movdqa %xmm12, %xmm2 4715; SSE-NEXT: pandn %xmm1, %xmm2 4716; SSE-NEXT: movdqa %xmm8, %xmm4 4717; SSE-NEXT: pandn %xmm13, %xmm4 4718; SSE-NEXT: movdqa %xmm10, %xmm0 4719; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4720; SSE-NEXT: pandn %xmm7, %xmm0 4721; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4722; SSE-NEXT: movdqa %xmm11, %xmm1 4723; SSE-NEXT: pandn %xmm7, %xmm1 4724; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4725; SSE-NEXT: pand %xmm8, %xmm7 4726; SSE-NEXT: movdqa %xmm8, %xmm10 4727; SSE-NEXT: por %xmm4, %xmm7 4728; SSE-NEXT: movdqa %xmm7, %xmm4 4729; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 4730; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 4731; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0] 4732; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0] 4733; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2] 4734; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] 4735; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] 4736; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 4737; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] 4738; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] 4739; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5] 4740; SSE-NEXT: packuswb %xmm4, %xmm7 4741; SSE-NEXT: pand %xmm12, %xmm7 4742; SSE-NEXT: por %xmm2, %xmm7 4743; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4744; SSE-NEXT: # xmm2 = mem[1,1,1,1] 4745; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4746; SSE-NEXT: # xmm4 = mem[0,2,2,3] 4747; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 4748; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7] 4749; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 4750; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 4751; SSE-NEXT: packuswb %xmm2, %xmm2 4752; SSE-NEXT: movdqa %xmm14, %xmm1 4753; SSE-NEXT: pandn %xmm2, %xmm1 4754; SSE-NEXT: pand %xmm14, %xmm7 4755; SSE-NEXT: por %xmm7, %xmm1 4756; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4757; SSE-NEXT: movdqa %xmm11, %xmm8 4758; SSE-NEXT: movdqa %xmm11, %xmm2 4759; SSE-NEXT: pandn %xmm5, %xmm2 4760; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4761; SSE-NEXT: pand %xmm11, %xmm4 4762; SSE-NEXT: por %xmm2, %xmm4 4763; SSE-NEXT: movdqa %xmm4, %xmm2 4764; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 4765; SSE-NEXT: movdqa %xmm6, %xmm7 4766; SSE-NEXT: pandn %xmm2, %xmm7 4767; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 4768; SSE-NEXT: pand %xmm6, %xmm4 4769; SSE-NEXT: por %xmm7, %xmm4 4770; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7] 4771; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 4772; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 4773; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] 4774; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm2[0,1,2,3,6,5,6,7] 4775; SSE-NEXT: packuswb %xmm4, %xmm4 4776; SSE-NEXT: movdqa %xmm12, %xmm3 4777; SSE-NEXT: pandn %xmm4, %xmm3 4778; SSE-NEXT: movdqa %xmm10, %xmm7 4779; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4780; SSE-NEXT: pandn %xmm5, %xmm7 4781; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4782; SSE-NEXT: movdqa %xmm0, %xmm14 4783; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 4784; SSE-NEXT: pand %xmm1, %xmm14 4785; SSE-NEXT: movdqa %xmm15, %xmm11 4786; SSE-NEXT: pand %xmm1, %xmm11 4787; SSE-NEXT: movdqa %xmm13, %xmm4 4788; SSE-NEXT: pand %xmm1, %xmm4 4789; SSE-NEXT: movdqa %xmm5, %xmm2 4790; SSE-NEXT: pand %xmm1, %xmm2 4791; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4792; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4793; SSE-NEXT: pandn %xmm2, %xmm1 4794; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4795; SSE-NEXT: pand %xmm8, %xmm0 4796; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4797; SSE-NEXT: pand %xmm8, %xmm15 4798; SSE-NEXT: pand %xmm8, %xmm13 4799; SSE-NEXT: pand %xmm8, %xmm5 4800; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4801; SSE-NEXT: movdqa %xmm2, %xmm0 4802; SSE-NEXT: pandn %xmm2, %xmm8 4803; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4804; SSE-NEXT: pand %xmm10, %xmm0 4805; SSE-NEXT: por %xmm7, %xmm0 4806; SSE-NEXT: movdqa %xmm0, %xmm7 4807; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] 4808; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4809; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0] 4810; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0] 4811; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] 4812; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7] 4813; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 4814; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4815; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 4816; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 4817; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 4818; SSE-NEXT: packuswb %xmm0, %xmm1 4819; SSE-NEXT: pand %xmm12, %xmm1 4820; SSE-NEXT: por %xmm3, %xmm1 4821; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4822; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 4823; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4824; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] 4825; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 4826; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] 4827; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 4828; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 4829; SSE-NEXT: packuswb %xmm0, %xmm0 4830; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 4831; SSE-NEXT: movdqa %xmm10, %xmm2 4832; SSE-NEXT: pandn %xmm0, %xmm2 4833; SSE-NEXT: pand %xmm10, %xmm1 4834; SSE-NEXT: por %xmm1, %xmm2 4835; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4836; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4837; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 4838; SSE-NEXT: pand %xmm3, %xmm0 
4839; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4840; SSE-NEXT: movdqa %xmm0, %xmm1 4841; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4842; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4843; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] 4844; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4845; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] 4846; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] 4847; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] 4848; SSE-NEXT: packuswb %xmm0, %xmm0 4849; SSE-NEXT: movdqa %xmm12, %xmm1 4850; SSE-NEXT: pandn %xmm0, %xmm1 4851; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 4852; SSE-NEXT: movdqa %xmm14, %xmm0 4853; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4854; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] 4855; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] 4856; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7] 4857; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 4858; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 4859; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 4860; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4861; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] 4862; SSE-NEXT: packuswb %xmm2, %xmm0 4863; SSE-NEXT: pand %xmm12, %xmm0 4864; SSE-NEXT: por %xmm1, %xmm0 4865; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4866; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4867; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] 4868; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 4869; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] 4870; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4871; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] 4872; SSE-NEXT: packuswb %xmm1, %xmm1 4873; SSE-NEXT: movdqa %xmm10, %xmm9 4874; SSE-NEXT: movdqa %xmm10, %xmm14 4875; SSE-NEXT: pandn %xmm1, %xmm14 4876; SSE-NEXT: pand %xmm10, %xmm0 4877; SSE-NEXT: por %xmm0, %xmm14 4878; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4879; SSE-NEXT: pand %xmm3, %xmm0 4880; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4881; SSE-NEXT: movdqa %xmm0, %xmm1 4882; SSE-NEXT: pxor %xmm2, %xmm2 4883; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 4884; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 4885; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] 4886; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4887; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] 4888; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] 4889; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] 4890; SSE-NEXT: packuswb %xmm0, %xmm0 4891; SSE-NEXT: movdqa %xmm12, %xmm1 4892; SSE-NEXT: pandn %xmm0, %xmm1 4893; SSE-NEXT: 
por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 4894; SSE-NEXT: movdqa %xmm11, %xmm0 4895; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 4896; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] 4897; SSE-NEXT: pxor %xmm10, %xmm10 4898; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0] 4899; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7] 4900; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 4901; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 4902; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 4903; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4904; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] 4905; SSE-NEXT: packuswb %xmm2, %xmm0 4906; SSE-NEXT: pand %xmm12, %xmm0 4907; SSE-NEXT: por %xmm1, %xmm0 4908; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4909; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4910; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] 4911; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 4912; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] 4913; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4914; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] 4915; SSE-NEXT: packuswb %xmm1, %xmm1 4916; SSE-NEXT: movdqa %xmm9, %xmm11 4917; SSE-NEXT: pandn %xmm1, %xmm11 4918; SSE-NEXT: pand %xmm9, %xmm0 4919; SSE-NEXT: por %xmm0, %xmm11 4920; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload 4921; SSE-NEXT: pand %xmm3, %xmm0 4922; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4923; SSE-NEXT: movdqa %xmm0, %xmm1 4924; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] 4925; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 4926; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] 4927; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4928; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] 4929; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] 4930; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] 4931; SSE-NEXT: packuswb %xmm0, %xmm0 4932; SSE-NEXT: movdqa %xmm12, %xmm1 4933; SSE-NEXT: pandn %xmm0, %xmm1 4934; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4935; SSE-NEXT: movdqa %xmm4, %xmm0 4936; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 4937; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 4938; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0] 4939; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] 4940; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 4941; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 4942; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 4943; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4944; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] 4945; SSE-NEXT: packuswb %xmm2, %xmm0 4946; 
SSE-NEXT: pand %xmm12, %xmm0 4947; SSE-NEXT: por %xmm1, %xmm0 4948; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4949; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4950; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] 4951; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 4952; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] 4953; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4954; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] 4955; SSE-NEXT: packuswb %xmm1, %xmm2 4956; SSE-NEXT: movdqa %xmm9, %xmm10 4957; SSE-NEXT: pandn %xmm2, %xmm10 4958; SSE-NEXT: pand %xmm9, %xmm0 4959; SSE-NEXT: por %xmm0, %xmm10 4960; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4961; SSE-NEXT: movdqa %xmm3, %xmm2 4962; SSE-NEXT: pand %xmm3, %xmm0 4963; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4964; SSE-NEXT: por %xmm0, %xmm2 4965; SSE-NEXT: movdqa %xmm2, %xmm0 4966; SSE-NEXT: pxor %xmm1, %xmm1 4967; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4968; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4969; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0] 4970; SSE-NEXT: movaps %xmm2, %xmm4 4971; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4972; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4973; SSE-NEXT: movdqa %xmm2, %xmm0 4974; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4975; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 4976; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0] 4977; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 4978; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 4979; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 4980; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 4981; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4982; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] 4983; SSE-NEXT: packuswb %xmm2, %xmm0 4984; SSE-NEXT: pand %xmm12, %xmm0 4985; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5] 4986; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] 4987; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 4988; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] 4989; SSE-NEXT: packuswb %xmm2, %xmm2 4990; SSE-NEXT: pandn %xmm2, %xmm12 4991; SSE-NEXT: por %xmm12, %xmm0 4992; SSE-NEXT: movdqa %xmm8, %xmm3 4993; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0] 4994; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] 4995; SSE-NEXT: pand %xmm9, %xmm0 4996; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] 4997; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 4998; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] 4999; SSE-NEXT: packuswb %xmm2, %xmm2 5000; SSE-NEXT: pandn %xmm2, %xmm9 5001; SSE-NEXT: por %xmm0, %xmm9 5002; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5003; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5004; SSE-NEXT: movdqa %xmm3, %xmm0 5005; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5006; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 5007; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] 5008; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2] 5009; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] 5010; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] 5011; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 5012; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5013; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] 5014; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5015; SSE-NEXT: packuswb %xmm0, %xmm2 5016; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] 5017; SSE-NEXT: movdqa %xmm4, %xmm3 5018; SSE-NEXT: pandn %xmm2, %xmm3 5019; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5020; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 5021; SSE-NEXT: pand %xmm12, %xmm8 5022; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 5023; SSE-NEXT: movdqa %xmm8, %xmm2 5024; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5025; SSE-NEXT: movdqa %xmm6, %xmm7 5026; SSE-NEXT: pandn %xmm2, %xmm7 5027; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] 5028; SSE-NEXT: pand %xmm6, %xmm8 5029; SSE-NEXT: por %xmm7, %xmm8 5030; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7] 5031; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] 5032; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 5033; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] 5034; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 5035; SSE-NEXT: packuswb %xmm2, %xmm2 5036; SSE-NEXT: pand %xmm4, %xmm2 5037; SSE-NEXT: por %xmm3, %xmm2 5038; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5039; SSE-NEXT: # xmm3 = mem[3,1,2,3] 5040; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 5041; SSE-NEXT: # xmm7 = mem[0,2,2,3] 5042; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] 5043; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] 5044; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 5045; SSE-NEXT: packuswb %xmm0, %xmm7 5046; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1] 5047; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5048; SSE-NEXT: movdqa %xmm15, %xmm0 5049; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5050; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] 5051; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3] 5052; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2] 5053; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2,3,1] 5054; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7] 5055; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 5056; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5057; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] 5058; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 5059; SSE-NEXT: packuswb %xmm0, %xmm3 5060; SSE-NEXT: movdqa %xmm4, %xmm7 5061; SSE-NEXT: pandn %xmm3, %xmm7 5062; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5063; SSE-NEXT: pand %xmm12, %xmm15 5064; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5065; SSE-NEXT: movdqa %xmm15, %xmm3 5066; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 5067; SSE-NEXT: movdqa %xmm6, %xmm8 5068; SSE-NEXT: pandn %xmm3, %xmm8 5069; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] 5070; SSE-NEXT: pand %xmm6, %xmm15 5071; SSE-NEXT: por %xmm8, %xmm15 5072; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7] 5073; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] 5074; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] 5075; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] 5076; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,6,5,6,7] 5077; SSE-NEXT: packuswb %xmm8, %xmm8 5078; SSE-NEXT: pand %xmm4, %xmm8 5079; SSE-NEXT: por %xmm7, %xmm8 5080; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5081; SSE-NEXT: # xmm3 = mem[3,1,2,3] 5082; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 5083; SSE-NEXT: # xmm7 = mem[0,2,2,3] 5084; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] 5085; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] 5086; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 5087; SSE-NEXT: packuswb %xmm0, %xmm7 5088; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1] 5089; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 5090; SSE-NEXT: movdqa %xmm13, %xmm0 5091; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5092; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] 5093; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] 5094; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2] 5095; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] 5096; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] 5097; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 5098; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5099; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] 5100; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 5101; SSE-NEXT: packuswb %xmm0, %xmm3 5102; SSE-NEXT: movdqa %xmm4, %xmm7 5103; SSE-NEXT: pandn %xmm3, %xmm7 5104; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload 5105; SSE-NEXT: pand %xmm12, %xmm13 5106; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 5107; SSE-NEXT: movdqa %xmm13, %xmm3 5108; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 5109; SSE-NEXT: movdqa %xmm6, %xmm5 5110; SSE-NEXT: pandn %xmm3, %xmm5 5111; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = 
xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] 5112; SSE-NEXT: pand %xmm6, %xmm13 5113; SSE-NEXT: por %xmm5, %xmm13 5114; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7] 5115; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] 5116; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] 5117; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] 5118; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7] 5119; SSE-NEXT: packuswb %xmm5, %xmm5 5120; SSE-NEXT: pand %xmm4, %xmm5 5121; SSE-NEXT: por %xmm7, %xmm5 5122; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5123; SSE-NEXT: # xmm3 = mem[3,1,2,3] 5124; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 5125; SSE-NEXT: # xmm7 = mem[0,2,2,3] 5126; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] 5127; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] 5128; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 5129; SSE-NEXT: packuswb %xmm0, %xmm7 5130; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1] 5131; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5132; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 5133; SSE-NEXT: movdqa %xmm7, %xmm0 5134; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5135; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] 5136; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] 5137; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2] 5138; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5139; SSE-NEXT: pand %xmm12, %xmm0 5140; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 5141; SSE-NEXT: por %xmm0, %xmm12 5142; SSE-NEXT: movdqa %xmm12, %xmm0 5143; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5144; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 5145; SSE-NEXT: pand %xmm6, %xmm12 5146; SSE-NEXT: pandn %xmm0, %xmm6 5147; SSE-NEXT: por %xmm12, %xmm6 5148; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7] 5149; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] 5150; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 5151; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] 5152; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 5153; SSE-NEXT: packuswb %xmm0, %xmm0 5154; SSE-NEXT: pand %xmm4, %xmm0 5155; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1] 5156; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] 5157; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] 5158; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5159; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7] 5160; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 5161; SSE-NEXT: packuswb %xmm6, %xmm3 5162; SSE-NEXT: pandn %xmm3, %xmm4 5163; SSE-NEXT: por %xmm4, %xmm0 5164; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5165; SSE-NEXT: # xmm3 = mem[3,1,2,3] 5166; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Folded Reload 5167; SSE-NEXT: # xmm4 = mem[0,2,2,3] 5168; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] 5169; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] 5170; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 5171; SSE-NEXT: packuswb %xmm6, %xmm4 5172; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1] 5173; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5174; SSE-NEXT: movaps %xmm3, 16(%rsi) 5175; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5176; SSE-NEXT: movaps %xmm3, 48(%rsi) 5177; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5178; SSE-NEXT: movaps %xmm3, (%rsi) 5179; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5180; SSE-NEXT: movaps %xmm3, 32(%rsi) 5181; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5182; SSE-NEXT: movaps %xmm3, 16(%rdx) 5183; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5184; SSE-NEXT: movaps %xmm3, 48(%rdx) 5185; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5186; SSE-NEXT: movaps %xmm3, (%rdx) 5187; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5188; SSE-NEXT: movaps %xmm3, 32(%rdx) 5189; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5190; SSE-NEXT: movaps %xmm1, 16(%rcx) 5191; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5192; SSE-NEXT: movaps %xmm1, 48(%rcx) 5193; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5194; SSE-NEXT: movaps %xmm1, (%rcx) 5195; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5196; SSE-NEXT: movaps %xmm1, 32(%rcx) 5197; SSE-NEXT: movdqa %xmm9, 16(%r8) 5198; SSE-NEXT: movdqa %xmm10, 48(%r8) 5199; SSE-NEXT: movdqa %xmm11, (%r8) 5200; SSE-NEXT: movdqa %xmm14, 32(%r8) 5201; SSE-NEXT: movaps %xmm0, 16(%r9) 5202; SSE-NEXT: movaps %xmm5, 48(%r9) 5203; SSE-NEXT: movaps %xmm8, (%r9) 5204; SSE-NEXT: movaps %xmm2, 32(%r9) 5205; SSE-NEXT: addq $552, %rsp # imm = 0x228 5206; SSE-NEXT: retq 5207; 5208; AVX-LABEL: load_i8_stride5_vf64: 5209; AVX: # %bb.0: 5210; AVX-NEXT: subq $488, %rsp # imm = 0x1E8 5211; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] 5212; AVX-NEXT: vmovdqa (%rdi), %xmm8 5213; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 5214; AVX-NEXT: vmovdqa 32(%rdi), %xmm12 5215; AVX-NEXT: vmovdqa 48(%rdi), %xmm9 5216; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm0 5217; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5218; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] 5219; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1 5220; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5221; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 5222; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] 5223; AVX-NEXT: # xmm4 = mem[0,0] 5224; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm0 5225; AVX-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill 5226; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] 5227; AVX-NEXT: # xmm5 = mem[0,0] 5228; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6 5229; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5230; AVX-NEXT: vpor %xmm0, %xmm6, %xmm6 5231; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 5232; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 5233; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5234; AVX-NEXT: vmovdqa 176(%rdi), 
%xmm14 5235; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2 5236; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5237; AVX-NEXT: vmovdqa 160(%rdi), %xmm13 5238; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 5239; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5240; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] 5241; AVX-NEXT: vmovdqa 208(%rdi), %xmm10 5242; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3 5243; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5244; AVX-NEXT: vmovdqa 192(%rdi), %xmm1 5245; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm4 5246; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5247; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 5248; AVX-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 5249; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5250; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0] 5251; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm4 5252; AVX-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0] 5253; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm6 5254; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 5255; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] 5256; AVX-NEXT: # xmm7 = mem[0,0] 5257; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm6 5258; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] 5259; AVX-NEXT: # xmm8 = mem[0,0] 5260; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm9 5261; AVX-NEXT: vpor %xmm6, %xmm9, %xmm6 5262; AVX-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2 5263; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5264; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 5265; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm4 5266; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 5267; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] 5268; AVX-NEXT: # xmm11 = mem[0,0] 5269; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4 5270; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm5 5271; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 5272; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 5273; AVX-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 5274; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5275; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm0 5276; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] 5277; AVX-NEXT: # xmm7 = mem[0,0] 5278; AVX-NEXT: vmovdqa 128(%rdi), %xmm13 5279; AVX-NEXT: vpshufb %xmm7, %xmm13, %xmm3 5280; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 5281; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] 5282; AVX-NEXT: # xmm5 = mem[0,0] 5283; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 5284; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5285; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm3 5286; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] 5287; AVX-NEXT: # xmm6 = mem[0,0] 5288; AVX-NEXT: vmovdqa 96(%rdi), %xmm1 5289; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5290; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm12 5291; AVX-NEXT: vpor %xmm3, %xmm12, %xmm3 5292; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 5293; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7] 5294; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u] 5295; AVX-NEXT: vmovdqa 80(%rdi), %xmm14 5296; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm15 5297; AVX-NEXT: vpor %xmm15, %xmm12, %xmm12 5298; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7] 5299; AVX-NEXT: vmovaps {{.*#+}} ymm12 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 5300; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload 5301; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] 5302; AVX-NEXT: vmovdqa 64(%rdi), %xmm1 5303; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5304; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm9 5305; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9 5306; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0 5307; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 5308; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5309; AVX-NEXT: vmovdqa 304(%rdi), %xmm0 5310; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5311; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm0 5312; AVX-NEXT: vmovdqa 288(%rdi), %xmm1 5313; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5314; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm9 5315; AVX-NEXT: vpor %xmm0, %xmm9, %xmm0 5316; AVX-NEXT: vmovdqa 272(%rdi), %xmm10 5317; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm5 5318; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5319; AVX-NEXT: vmovdqa 256(%rdi), %xmm9 5320; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm6 5321; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 5322; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] 5323; AVX-NEXT: vmovdqa 240(%rdi), %xmm1 5324; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5325; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 5326; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 5327; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 5328; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload 5329; AVX-NEXT: vmovdqa 224(%rdi), %xmm5 5330; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 5331; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5332; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2 5333; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 5334; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 5335; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5336; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] 5337; AVX-NEXT: # xmm2 = mem[0,0] 5338; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm0 5339; AVX-NEXT: vmovdqa %xmm8, %xmm11 5340; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5341; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] 5342; AVX-NEXT: # xmm3 = mem[0,0] 5343; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5344; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm4 5345; AVX-NEXT: vpor %xmm0, %xmm4, %xmm4 5346; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5347; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] 5348; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5349; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u] 5350; AVX-NEXT: vpor %xmm0, %xmm8, %xmm8 5351; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u] 5352; AVX-NEXT: vmovdqa %xmm14, %xmm6 5353; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5354; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] 5355; AVX-NEXT: vpor %xmm14, %xmm8, %xmm8 5356; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7] 5357; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload 5358; AVX-NEXT: vbroadcastss 
{{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] 5359; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5360; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm15 5361; AVX-NEXT: vandnps %ymm15, %ymm12, %ymm15 5362; AVX-NEXT: vorps %ymm15, %ymm8, %ymm8 5363; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 5364; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5365; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5366; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 5367; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5368; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm3 5369; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 5370; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] 5371; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u] 5372; AVX-NEXT: vmovdqa %xmm9, %xmm8 5373; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5374; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 5375; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] 5376; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3 5377; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5378; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u] 5379; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 5380; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 5381; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload 5382; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm3 5383; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 5384; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 5385; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 5386; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5387; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] 5388; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 5389; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 5390; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u] 5391; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5] 5392; AVX-NEXT: # xmm4 = mem[0,0] 5393; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm5 5394; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 5395; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3 5396; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u] 5397; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm14 5398; AVX-NEXT: vpor %xmm3, %xmm14, %xmm3 5399; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7] 5400; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5401; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] 5402; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] 5403; AVX-NEXT: # xmm3 = mem[0,0] 5404; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload 5405; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm12 5406; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] 5407; AVX-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0] 5408; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5409; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm0 5410; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] 5411; AVX-NEXT: # xmm9 = mem[0,0] 5412; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5413; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm13 5414; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm13[3,4,5,6,7] 5415; AVX-NEXT: vpor %xmm0, %xmm12, %xmm0 5416; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 5417; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 5418; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5419; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] 5420; AVX-NEXT: vandnps %ymm13, %ymm12, %ymm13 5421; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 5422; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 5423; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5424; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5425; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] 5426; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5427; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 5428; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 5429; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u] 5430; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5431; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm4 5432; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 5433; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u] 5434; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm4 5435; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1 5436; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] 5437; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5438; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u] 5439; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5440; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 5441; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7] 5442; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5443; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm3 5444; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5445; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm2 5446; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] 5447; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 5448; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 5449; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5450; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] 5451; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2 5452; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 5453; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 5454; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5455; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] 5456; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] 5457; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] 5458; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 5459; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u] 5460; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] 5461; AVX-NEXT: vpor %xmm0, %xmm1, %xmm3 5462; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5463; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u] 5464; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5465; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 5466; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 5467; AVX-NEXT: 
vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] 5468; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u] 5469; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5470; AVX-NEXT: vpshufb %xmm2, %xmm15, %xmm1 5471; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 5472; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] 5473; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5474; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 5475; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 5476; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] 5477; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3 5478; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4 5479; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 5480; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5481; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] 5482; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5483; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 5484; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 5485; AVX-NEXT: vextractf128 $1, %ymm3, %xmm7 5486; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551615,255] 5487; AVX-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 5488; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 5489; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5490; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u] 5491; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] 5492; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] 5493; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 5494; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] 5495; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] 5496; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 5497; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u] 5498; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5499; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u] 5500; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 5501; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] 5502; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm5 5503; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5504; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2 5505; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 5506; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 5507; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 5508; AVX-NEXT: vandps %ymm3, %ymm12, %ymm2 5509; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm0 5510; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 5511; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5512; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] 5513; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5514; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 5515; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 5516; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 5517; AVX-NEXT: vpmovsxwq {{.*#+}} xmm8 = [18446744073709551615,255] 5518; AVX-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 5519; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 5520; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
5521; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] 5522; AVX-NEXT: # xmm0 = mem[0,0] 5523; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm2 5524; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] 5525; AVX-NEXT: # xmm1 = mem[0,0] 5526; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm3 5527; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 5528; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5529; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] 5530; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] 5531; AVX-NEXT: # xmm3 = mem[0,0] 5532; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5533; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6 5534; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 5535; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 5536; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] 5537; AVX-NEXT: vpor %xmm7, %xmm4, %xmm4 5538; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload 5539; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] 5540; AVX-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0] 5541; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 5542; AVX-NEXT: vpshufb %xmm7, %xmm12, %xmm12 5543; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] 5544; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] 5545; AVX-NEXT: # xmm12 = mem[0,0] 5546; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5547; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm13 5548; AVX-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0] 5549; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5550; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15 5551; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] 5552; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 5553; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2 5554; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] 5555; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5556; AVX-NEXT: vpshufb %xmm13, %xmm15, %xmm15 5557; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 5558; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] 5559; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 5560; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0 5561; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 5562; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 5563; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5564; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u] 5565; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3 5566; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 5567; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u] 5568; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u] 5569; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 5570; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 5571; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5572; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 5573; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5574; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 5575; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] 5576; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 
5577; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm4 5578; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5579; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 5580; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] 5581; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 5582; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5583; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm4 5584; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 5585; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] 5586; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 5587; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5588; AVX-NEXT: vmovaps %ymm1, 32(%rsi) 5589; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5590; AVX-NEXT: vmovaps %ymm1, (%rsi) 5591; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5592; AVX-NEXT: vmovaps %ymm1, 32(%rdx) 5593; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5594; AVX-NEXT: vmovaps %ymm1, (%rdx) 5595; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5596; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 5597; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5598; AVX-NEXT: vmovaps %ymm1, (%rcx) 5599; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5600; AVX-NEXT: vmovaps %ymm1, 32(%r8) 5601; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 5602; AVX-NEXT: vmovaps %ymm1, (%r8) 5603; AVX-NEXT: vmovaps %ymm0, 32(%r9) 5604; AVX-NEXT: vmovaps %ymm2, (%r9) 5605; AVX-NEXT: addq $488, %rsp # imm = 0x1E8 5606; AVX-NEXT: vzeroupper 5607; AVX-NEXT: retq 5608; 5609; AVX2-LABEL: load_i8_stride5_vf64: 5610; AVX2: # %bb.0: 5611; AVX2-NEXT: subq $136, %rsp 5612; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 5613; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 5614; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 5615; AVX2-NEXT: vmovdqa 256(%rdi), %ymm9 5616; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 5617; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 5618; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5619; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 5620; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 5621; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 5622; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 5623; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5624; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 5625; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5626; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 5627; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5628; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] 5629; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 5630; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5631; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5632; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 5633; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5634; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 5635; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 5636; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 5637; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5638; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] 5639; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 5640; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5641; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5642; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 5643; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5644; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5645; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5646; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5647; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5648; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 5649; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 5650; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5651; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] 5652; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 5653; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5654; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5655; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 5656; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5657; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5658; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5659; AVX2-NEXT: vmovdqa 160(%rdi), %ymm13 5660; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14 5661; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5662; AVX2-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 5663; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 5664; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] 5665; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 5666; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] 5667; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5668; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm1 5669; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] 5670; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 5671; AVX2-NEXT: vpshufb %ymm3, %ymm15, %ymm15 5672; AVX2-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] 5673; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 5674; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5675; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm7 5676; AVX2-NEXT: vmovdqa (%rdi), %ymm3 5677; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 5678; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 5679; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8 5680; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm5 5681; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5682; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 5683; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 5684; AVX2-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 5685; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] 5686; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm5 5687; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 5688; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] 5689; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5690; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 5691; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] 5692; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 5693; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5694; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm8 5695; AVX2-NEXT: 
vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 5696; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0 5697; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 5698; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm6 5699; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 5700; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 5701; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 5702; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 5703; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5704; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 5705; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 5706; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] 5707; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 5708; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] 5709; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 5710; AVX2-NEXT: vpor %xmm4, %xmm0, %xmm0 5711; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] 5712; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 5713; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5714; AVX2-NEXT: vpshufb %ymm4, %ymm12, %ymm12 5715; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 5716; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5717; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5718; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 5719; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 5720; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 5721; AVX2-NEXT: vpshufb %xmm5, %xmm12, %xmm5 5722; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 5723; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 5724; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 5725; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5726; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 5727; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 5728; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] 5729; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5 5730; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 5731; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u] 5732; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 5733; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 5734; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] 5735; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 5736; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5737; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm12 5738; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 5739; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5740; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5741; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 5742; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 5743; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 5744; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 5745; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 5746; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 5747; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 5748; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5749; AVX2-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 5750; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5751; AVX2-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 5752; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] 5753; AVX2-NEXT: vbroadcasti128 {{.*#+}} 
ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] 5754; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 5755; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 5756; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] 5757; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 5758; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5759; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] 5760; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 5761; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 5762; AVX2-NEXT: vmovdqa 304(%rdi), %xmm2 5763; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 5764; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm0 5765; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] 5766; AVX2-NEXT: vmovdqa 288(%rdi), %xmm1 5767; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm3 5768; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 5769; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5770; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload 5771; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] 5772; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 5773; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5774; AVX2-NEXT: vmovdqa 144(%rdi), %xmm3 5775; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm4 5776; AVX2-NEXT: vmovdqa 128(%rdi), %xmm5 5777; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm12 5778; AVX2-NEXT: vpor %xmm4, %xmm12, %xmm4 5779; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 5780; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] 5781; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] 5782; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5783; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] 5784; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm7 5785; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] 5786; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm15 5787; AVX2-NEXT: vpor %xmm7, %xmm15, %xmm7 5788; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5789; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] 5790; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] 5791; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5792; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm8 5793; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm12 5794; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8 5795; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5796; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] 5797; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] 5798; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] 5799; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm11 5800; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] 5801; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm14 5802; AVX2-NEXT: vpor %xmm11, %xmm14, %xmm11 5803; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 5804; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 5805; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 5806; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm12 5807; AVX2-NEXT: vpshufb %xmm15, %xmm5, %xmm15 
5808; AVX2-NEXT: vpor %xmm12, %xmm15, %xmm12 5809; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5810; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 5811; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm15 5812; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] 5813; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm15 5814; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] 5815; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm13 5816; AVX2-NEXT: vpor %xmm15, %xmm13, %xmm13 5817; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] 5818; AVX2-NEXT: # ymm15 = mem[0,1,0,1] 5819; AVX2-NEXT: vpshufb %ymm15, %ymm10, %ymm10 5820; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] 5821; AVX2-NEXT: vmovdqa 288(%rdi), %ymm13 5822; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] 5823; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm13 5824; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] 5825; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm13 5826; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 5827; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 5828; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm13 5829; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm15 5830; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm6 5831; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7 5832; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 5833; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] 5834; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7 5835; AVX2-NEXT: vpshufb %ymm4, %ymm7, %ymm4 5836; AVX2-NEXT: vpermd %ymm4, %ymm0, %ymm0 5837; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 5838; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] 5839; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 5840; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] 5841; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 5842; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 5843; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5844; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 5845; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] 5846; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 5847; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm2 5848; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm3 5849; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 5850; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5851; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5852; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] 5853; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 5854; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 5855; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) 5856; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5857; AVX2-NEXT: vmovaps %ymm3, (%rsi) 5858; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 5859; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) 5860; AVX2-NEXT: vmovdqa %ymm8, (%rdx) 5861; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 5862; AVX2-NEXT: vmovdqa %ymm2, (%rcx) 5863; AVX2-NEXT: vmovdqa %ymm11, 32(%r8) 5864; AVX2-NEXT: vmovdqa %ymm12, (%r8) 5865; AVX2-NEXT: vmovdqa %ymm10, 32(%r9) 5866; AVX2-NEXT: vmovdqa %ymm0, (%r9) 5867; AVX2-NEXT: addq $136, %rsp 5868; AVX2-NEXT: vzeroupper 5869; AVX2-NEXT: retq 5870; 5871; AVX2-FP-LABEL: load_i8_stride5_vf64: 5872; 
AVX2-FP: # %bb.0: 5873; AVX2-FP-NEXT: subq $136, %rsp 5874; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 5875; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 5876; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm10 5877; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm9 5878; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 5879; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 5880; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5881; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 5882; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] 5883; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 5884; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 5885; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5886; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 5887; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5888; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 5889; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5890; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] 5891; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] 5892; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5893; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5894; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 5895; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5896; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 5897; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 5898; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 5899; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5900; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] 5901; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] 5902; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5903; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5904; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 5905; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5906; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5907; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5908; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5909; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5910; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 5911; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 5912; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5913; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] 5914; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] 5915; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5916; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5917; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 5918; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 5919; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 5920; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5921; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13 5922; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14 5923; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = 
[65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5924; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 5925; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 5926; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] 5927; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 5928; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] 5929; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5930; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm1 5931; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] 5932; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] 5933; AVX2-FP-NEXT: vpshufb %ymm3, %ymm15, %ymm15 5934; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] 5935; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 5936; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5937; AVX2-FP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 5938; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 5939; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 5940; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 5941; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm8 5942; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 5943; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5944; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 5945; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 5946; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 5947; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] 5948; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm5 5949; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 5950; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] 5951; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5952; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 5953; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] 5954; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] 5955; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 5956; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 5957; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 5958; AVX2-FP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 5959; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 5960; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 5961; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 5962; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 5963; AVX2-FP-NEXT: vpor %xmm6, %xmm4, %xmm4 5964; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 5965; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 5966; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 5967; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4 5968; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] 5969; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 5970; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] 5971; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 5972; AVX2-FP-NEXT: vpor %xmm4, %xmm0, %xmm0 5973; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] 5974; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] 5975; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5976; AVX2-FP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 5977; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 5978; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5979; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload 5980; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 5981; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 5982; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm12 5983; AVX2-FP-NEXT: vpshufb %xmm5, %xmm12, %xmm5 5984; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 5985; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 5986; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 5987; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5988; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 5989; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 5990; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] 5991; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 5992; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 5993; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u] 5994; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 5995; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 5996; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] 5997; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] 5998; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 5999; AVX2-FP-NEXT: vpshufb %ymm5, %ymm12, %ymm12 6000; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 6001; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6002; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6003; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 6004; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 6005; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 6006; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 6007; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 6008; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 6009; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 6010; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 6011; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 6012; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6013; AVX2-FP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6014; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] 6015; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] 6016; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] 6017; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 6018; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] 6019; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 6020; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6021; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] 6022; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 6023; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 6024; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm2 6025; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 6026; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 6027; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] 6028; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm1 6029; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 6030; AVX2-FP-NEXT: vpor %xmm0, %xmm3, %xmm0 6031; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6032; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload 6033; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] 6034; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 6035; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6036; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm3 6037; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 6038; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5 6039; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 6040; AVX2-FP-NEXT: vpor %xmm4, %xmm12, %xmm4 6041; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 6042; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] 6043; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] 6044; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6045; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] 6046; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm7 6047; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] 6048; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm15 6049; AVX2-FP-NEXT: vpor %xmm7, %xmm15, %xmm7 6050; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6051; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] 6052; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] 6053; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6054; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm8 6055; AVX2-FP-NEXT: vpshufb %xmm14, %xmm5, %xmm12 6056; AVX2-FP-NEXT: vpor %xmm8, %xmm12, %xmm8 6057; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6058; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] 6059; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] 6060; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] 6061; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm11 6062; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] 6063; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm14 6064; AVX2-FP-NEXT: vpor %xmm11, %xmm14, %xmm11 6065; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 6066; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 6067; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 6068; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 6069; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 6070; AVX2-FP-NEXT: vpor %xmm12, %xmm15, %xmm12 6071; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6072; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 6073; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm15 6074; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] 6075; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm15 6076; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] 6077; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm13 6078; AVX2-FP-NEXT: vpor %xmm15, %xmm13, %xmm13 6079; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] 6080; AVX2-FP-NEXT: # ymm15 = mem[0,1,0,1] 6081; AVX2-FP-NEXT: vpshufb %ymm15, %ymm10, %ymm10 6082; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] 6083; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm13 6084; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] 6085; AVX2-FP-NEXT: vpshufb 
%ymm4, %ymm13, %ymm13 6086; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] 6087; AVX2-FP-NEXT: vpermd %ymm13, %ymm0, %ymm13 6088; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 6089; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6090; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm13 6091; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm15 6092; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 6093; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 6094; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 6095; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] 6096; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm7 6097; AVX2-FP-NEXT: vpshufb %ymm4, %ymm7, %ymm4 6098; AVX2-FP-NEXT: vpermd %ymm4, %ymm0, %ymm0 6099; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 6100; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] 6101; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 6102; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] 6103; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 6104; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 6105; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6106; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6107; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] 6108; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6109; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 6110; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 6111; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 6112; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6113; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6114; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] 6115; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 6116; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 6117; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) 6118; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6119; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) 6120; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6121; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) 6122; AVX2-FP-NEXT: vmovdqa %ymm8, (%rdx) 6123; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rcx) 6124; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx) 6125; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r8) 6126; AVX2-FP-NEXT: vmovdqa %ymm12, (%r8) 6127; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%r9) 6128; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) 6129; AVX2-FP-NEXT: addq $136, %rsp 6130; AVX2-FP-NEXT: vzeroupper 6131; AVX2-FP-NEXT: retq 6132; 6133; AVX2-FCP-LABEL: load_i8_stride5_vf64: 6134; AVX2-FCP: # %bb.0: 6135; AVX2-FCP-NEXT: subq $136, %rsp 6136; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 6137; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 6138; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 6139; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 6140; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 6141; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 6142; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6143; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 6144; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 6145; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 6146; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 6147; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6148; AVX2-FCP-NEXT: 
vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 6149; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 6150; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 6151; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6152; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] 6153; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 6154; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 6155; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6156; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 6157; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6158; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 6159; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 6160; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 6161; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6162; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] 6163; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 6164; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 6165; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6166; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 6167; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6168; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6169; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6170; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 6171; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6172; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 6173; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 6174; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6175; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] 6176; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 6177; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 6178; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6179; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 6180; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6181; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 6182; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6183; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 6184; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 6185; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 6186; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 6187; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 6188; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] 6189; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 6190; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] 6191; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 6192; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 6193; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] 6194; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 6195; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm15 6196; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] 6197; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, 
%ymm0 6198; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6199; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 6200; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 6201; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 6202; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 6203; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 6204; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 6205; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 6206; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 6207; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 6208; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 6209; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] 6210; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm5 6211; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 6212; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] 6213; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 6214; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 6215; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] 6216; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 6217; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6218; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 6219; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 6220; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 6221; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 6222; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 6223; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 6224; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 6225; AVX2-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 6226; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 6227; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 6228; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 6229; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 6230; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] 6231; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 6232; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] 6233; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 6234; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 6235; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] 6236; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 6237; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6238; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 6239; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 6240; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6241; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6242; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 6243; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 6244; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm12 6245; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm5 6246; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 6247; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 6248; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 6249; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6250; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] 6251; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 6252; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] 6253; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 6254; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 6255; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u] 6256; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 6257; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 6258; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] 6259; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 6260; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6261; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm12 6262; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 6263; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6264; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6265; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 6266; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 6267; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 6268; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 6269; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 6270; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 6271; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 6272; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] 6273; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 6274; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6275; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6276; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] 6277; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] 6278; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 6279; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 6280; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] 6281; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 6282; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6283; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] 6284; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] 6285; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 6286; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm2 6287; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 6288; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 6289; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] 6290; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1 6291; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 6292; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 6293; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6294; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload 6295; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] 6296; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 6297; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6298; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm3 6299; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 6300; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 6301; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 6302; AVX2-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4 6303; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 6304; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] 6305; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] 6306; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6307; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = 
[0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] 6308; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm7 6309; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] 6310; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm15 6311; AVX2-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 6312; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6313; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] 6314; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] 6315; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6316; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm8 6317; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm12 6318; AVX2-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 6319; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6320; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] 6321; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] 6322; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] 6323; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm11 6324; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] 6325; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm14 6326; AVX2-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 6327; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 6328; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] 6329; AVX2-FCP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 6330; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 6331; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 6332; AVX2-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12 6333; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6334; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 6335; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 6336; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] 6337; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm15 6338; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] 6339; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm13 6340; AVX2-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 6341; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] 6342; AVX2-FCP-NEXT: # ymm15 = mem[0,1,0,1] 6343; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm10 6344; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] 6345; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm13 6346; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] 6347; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm13 6348; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] 6349; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13 6350; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 6351; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6352; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm13 6353; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm15 6354; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 6355; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 6356; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 6357; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] 6358; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 6359; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4 6360; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0 
6361; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 6362; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] 6363; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 6364; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] 6365; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 6366; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 6367; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6368; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6369; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] 6370; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6371; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 6372; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 6373; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 6374; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6375; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6376; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] 6377; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 6378; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 6379; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) 6380; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6381; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) 6382; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6383; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) 6384; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rdx) 6385; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rcx) 6386; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx) 6387; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r8) 6388; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r8) 6389; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%r9) 6390; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) 6391; AVX2-FCP-NEXT: addq $136, %rsp 6392; AVX2-FCP-NEXT: vzeroupper 6393; AVX2-FCP-NEXT: retq 6394; 6395; AVX512-LABEL: load_i8_stride5_vf64: 6396; AVX512: # %bb.0: 6397; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 6398; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 6399; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 6400; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 6401; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23 6402; AVX512-NEXT: vmovdqa %ymm5, %ymm4 6403; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 6404; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 6405; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) 6406; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 6407; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7 6408; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 6409; AVX512-NEXT: vmovdqa %ymm4, %ymm8 6410; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) 6411; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 6412; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 6413; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] 6414; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10 6415; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 6416; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 6417; AVX512-NEXT: vmovdqa64 
192(%rdi), %ymm26 6418; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 6419; AVX512-NEXT: vmovdqa %ymm4, %ymm11 6420; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) 6421; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 6422; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) 6423; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 6424; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12 6425; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 6426; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] 6427; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15 6428; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 6429; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 6430; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 6431; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6 6432; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13 6433; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 6434; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6 6435; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6436; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 6437; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 6438; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) 6439; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14 6440; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11 6441; AVX512-NEXT: vmovdqa %ymm5, %ymm10 6442; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) 6443; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0 6444; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] 6445; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero 6446; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0 6447; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6448; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] 6449; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 6450; AVX512-NEXT: vmovdqa %ymm4, %ymm0 6451; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 6452; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 6453; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] 6454; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero 6455; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 6456; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6457; AVX512-NEXT: vmovdqa %ymm5, %ymm6 6458; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) 6459; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) 6460; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 6461; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15 6462; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] 6463; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 6464; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] 6465; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 6466; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 
6467; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 6468; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6469; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 6470; AVX512-NEXT: vmovdqa %ymm10, %ymm0 6471; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6472; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 6473; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) 6474; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 6475; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 6476; AVX512-NEXT: vmovdqa %ymm5, %ymm2 6477; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) 6478; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] 6479; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 6480; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] 6481; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2 6482; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 6483; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6484; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 6485; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 6486; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6487; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) 6488; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 6489; AVX512-NEXT: vmovdqa %ymm5, %ymm0 6490; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 6491; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero 6492; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 6493; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] 6494; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 6495; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6496; AVX512-NEXT: vmovdqa %ymm4, %ymm1 6497; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 6498; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 6499; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 6500; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] 6501; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6502; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 6503; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6504; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6505; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6506; AVX512-NEXT: vmovdqa %ymm4, %ymm0 6507; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6508; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6509; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 6510; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 6511; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 6512; AVX512-NEXT: vmovdqa %ymm10, %ymm3 6513; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 6514; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 6515; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 6516; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] 6517; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 6518; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 6519; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6520; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 6521; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 6522; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6523; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) 6524; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 6525; AVX512-NEXT: vmovdqa %ymm10, %ymm0 6526; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) 6527; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 6528; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] 6529; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero 6530; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 6531; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6532; AVX512-NEXT: vmovdqa %ymm5, %ymm1 6533; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 6534; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 6535; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 6536; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] 6537; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6538; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 6539; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6540; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6541; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6542; AVX512-NEXT: vmovdqa %ymm5, %ymm0 6543; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6544; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6545; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 6546; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 6547; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 6548; AVX512-NEXT: vmovdqa %ymm4, %ymm3 6549; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 6550; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] 6551; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 6552; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] 6553; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 6554; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 6555; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6556; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 6557; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 6558; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6559; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 6560; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) 6561; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 6562; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) 6563; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero 6564; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 6565; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 6566; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 6567; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6568; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) 6569; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) 6570; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6571; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] 6572; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6573; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 6574; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] 6575; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] 6576; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) 6577; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) 6578; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 6579; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 6580; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] 6581; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 6582; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 6583; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 6584; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) 6585; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 6586; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] 6587; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 6588; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 6589; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4 6590; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) 6591; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 6592; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) 6593; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) 6594; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) 6595; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) 6596; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) 6597; AVX512-NEXT: vzeroupper 6598; AVX512-NEXT: retq 6599; 6600; AVX512-FCP-LABEL: load_i8_stride5_vf64: 6601; AVX512-FCP: # %bb.0: 6602; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 6603; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 6604; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 6605; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 6606; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 6607; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 6608; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 6609; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 6610; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) 6611; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 6612; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 6613; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 6614; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8 6615; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) 6616; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 6617; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 6618; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] 6619; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 6620; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 6621; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 6622; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 6623; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 6624; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11 6625; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) 6626; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 6627; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) 6628; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 6629; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 6630; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 6631; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] 6632; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 6633; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 6634; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 6635; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 6636; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 6637; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 6638; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 6639; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 6640; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6641; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 6642; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 6643; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) 6644; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 6645; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 6646; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 6647; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) 6648; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 6649; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] 6650; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero 6651; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 6652; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6653; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] 6654; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 6655; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 6656; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 6657; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 6658; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] 6659; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero 6660; AVX512-FCP-NEXT: vpor %xmm6, 
%xmm0, %xmm0 6661; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6662; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6 6663; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) 6664; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) 6665; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 6666; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 6667; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] 6668; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 6669; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] 6670; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 6671; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 6672; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 6673; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6674; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 6675; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 6676; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6677; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 6678; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) 6679; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 6680; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 6681; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 6682; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) 6683; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] 6684; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 6685; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] 6686; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 6687; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 6688; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6689; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 6690; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 6691; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6692; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) 6693; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 6694; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 6695; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 6696; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero 6697; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 6698; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] 6699; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 6700; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6701; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 6702; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 6703; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 6704; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 6705; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] 6706; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6707; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 6708; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6709; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6710; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6711; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 6712; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6713; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6714; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 6715; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 6716; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 6717; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 6718; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 6719; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 6720; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 6721; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] 6722; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 6723; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 6724; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6725; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 6726; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 6727; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6728; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) 6729; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 6730; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 6731; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) 6732; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 6733; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] 6734; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero 6735; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 6736; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6737; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 6738; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 6739; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 6740; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 6741; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] 6742; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6743; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 6744; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6745; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6746; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6747; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 6748; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6749; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6750; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 6751; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 6752; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 6753; 
AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 6754; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 6755; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] 6756; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 6757; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] 6758; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 6759; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 6760; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6761; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 6762; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 6763; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6764; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 6765; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) 6766; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 6767; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) 6768; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero 6769; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 6770; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 6771; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 6772; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6773; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) 6774; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) 6775; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6776; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] 6777; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6778; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 6779; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] 6780; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] 6781; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) 6782; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) 6783; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 6784; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 6785; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] 6786; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 6787; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 6788; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 6789; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) 6790; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 6791; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] 6792; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 6793; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 6794; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 6795; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) 6796; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 6797; AVX512-FCP-NEXT: 
vmovdqa64 %zmm19, (%rsi) 6798; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) 6799; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 6800; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) 6801; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) 6802; AVX512-FCP-NEXT: vzeroupper 6803; AVX512-FCP-NEXT: retq 6804; 6805; AVX512DQ-LABEL: load_i8_stride5_vf64: 6806; AVX512DQ: # %bb.0: 6807; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 6808; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 6809; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 6810; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 6811; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23 6812; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 6813; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 6814; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 6815; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) 6816; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 6817; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7 6818; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 6819; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 6820; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) 6821; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 6822; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 6823; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] 6824; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10 6825; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 6826; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 6827; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 6828; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 6829; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11 6830; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) 6831; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 6832; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) 6833; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 6834; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12 6835; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 6836; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] 6837; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15 6838; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 6839; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 6840; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 6841; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6 6842; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm13 6843; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 6844; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6 6845; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6846; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 6847; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 6848; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) 6849; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14 6850; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11 6851; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10 6852; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) 6853; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0 6854; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] 6855; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero 6856; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0 6857; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6858; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] 6859; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 6860; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 6861; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 6862; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 6863; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] 6864; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero 6865; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 6866; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6867; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 6868; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) 6869; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) 6870; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 6871; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15 6872; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] 6873; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 6874; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] 6875; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 6876; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 6877; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 6878; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6879; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 6880; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 6881; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6882; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 6883; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) 6884; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 6885; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3 6886; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 6887; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) 6888; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] 6889; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 6890; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] 6891; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2 6892; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 6893; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6894; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 6895; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 6896; AVX512DQ-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 6897; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) 6898; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 6899; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 6900; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 6901; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero 6902; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 6903; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] 6904; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 6905; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6906; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 6907; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 6908; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 6909; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 6910; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] 6911; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6912; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 6913; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6914; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6915; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6916; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 6917; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6918; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6919; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 6920; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 6921; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 6922; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 6923; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 6924; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 6925; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 6926; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] 6927; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 6928; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 6929; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6930; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 6931; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 6932; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6933; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) 6934; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 6935; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 6936; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) 6937; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 6938; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] 6939; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero 6940; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 6941; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6942; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 6943; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 6944; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 6945; 
AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 6946; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] 6947; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6948; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 6949; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6950; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 6951; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 6952; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 6953; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 6954; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6955; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 6956; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 6957; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 6958; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 6959; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 6960; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] 6961; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 6962; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] 6963; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 6964; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 6965; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 6966; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 6967; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 6968; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6969; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 6970; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) 6971; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 6972; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) 6973; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero 6974; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3 6975; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 6976; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 6977; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6978; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) 6979; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) 6980; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6981; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] 6982; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 6983; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 6984; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] 6985; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] 6986; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) 6987; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) 6988; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 6989; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 6990; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] 6991; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 6992; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 6993; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 6994; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) 6995; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 6996; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] 6997; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 6998; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 6999; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4 7000; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) 7001; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 7002; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) 7003; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx) 7004; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) 7005; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) 7006; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) 7007; AVX512DQ-NEXT: vzeroupper 7008; AVX512DQ-NEXT: retq 7009; 7010; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64: 7011; AVX512DQ-FCP: # %bb.0: 7012; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] 7013; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 7014; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 7015; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 7016; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 7017; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 7018; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 7019; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 7020; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) 7021; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 7022; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 7023; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] 7024; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8 7025; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) 7026; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 7027; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] 7028; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] 7029; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 7030; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] 7031; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 7032; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 7033; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 7034; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 7035; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) 7036; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 7037; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) 7038; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 7039; AVX512DQ-FCP-NEXT: 
vmovdqa 160(%rdi), %ymm12 7040; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 7041; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] 7042; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 7043; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 7044; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 7045; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 7046; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 7047; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 7048; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 7049; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 7050; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 7051; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 7052; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 7053; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) 7054; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 7055; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 7056; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 7057; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) 7058; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 7059; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] 7060; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero 7061; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 7062; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7063; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] 7064; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 7065; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 7066; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 7067; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 7068; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] 7069; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero 7070; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 7071; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7072; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6 7073; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) 7074; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) 7075; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 7076; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 7077; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] 7078; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 7079; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] 7080; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 7081; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 7082; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 7083; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 7084; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] 7085; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 7086; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 7087; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 7088; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) 7089; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] 7090; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 7091; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 7092; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) 7093; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] 7094; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 7095; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] 7096; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 7097; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 7098; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 7099; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 7100; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 7101; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7102; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) 7103; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 7104; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 7105; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) 7106; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero 7107; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 7108; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] 7109; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 7110; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7111; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 7112; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 7113; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 7114; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 7115; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] 7116; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 7117; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 7118; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 7119; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7120; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 7121; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 7122; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 7123; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 7124; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 7125; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] 7126; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 7127; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 7128; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 7129; AVX512DQ-FCP-NEXT: 
vextracti128 $1, %ymm3, %xmm7 7130; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] 7131; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] 7132; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 7133; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 7134; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 7135; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 7136; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 7137; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7138; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) 7139; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 7140; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 7141; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) 7142; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 7143; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] 7144; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero 7145; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 7146; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7147; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 7148; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) 7149; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) 7150; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 7151; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] 7152; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 7153; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 7154; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 7155; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7156; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) 7157; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 7158; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) 7159; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 7160; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 7161; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 7162; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 7163; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 7164; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) 7165; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] 7166; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 7167; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] 7168; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 7169; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 7170; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 7171; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 7172; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 7173; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7174; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = 
[0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 7175; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) 7176; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 7177; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) 7178; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero 7179; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 7180; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 7181; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 7182; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7183; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) 7184; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) 7185; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 7186; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] 7187; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] 7188; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 7189; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] 7190; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] 7191; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) 7192; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) 7193; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 7194; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] 7195; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] 7196; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 7197; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) 7198; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 7199; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) 7200; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 7201; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] 7202; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 7203; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 7204; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 7205; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) 7206; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 7207; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) 7208; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) 7209; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 7210; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) 7211; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) 7212; AVX512DQ-FCP-NEXT: vzeroupper 7213; AVX512DQ-FCP-NEXT: retq 7214; 7215; AVX512BW-LABEL: load_i8_stride5_vf64: 7216; AVX512BW: # %bb.0: 7217; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 7218; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 7219; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 7220; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm1 7221; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294 7222; AVX512BW-NEXT: kmovd %eax, %k2 7223; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} 7224; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 7225; AVX512BW-NEXT: 
movl $1108344832, %eax # imm = 0x42100000 7226; AVX512BW-NEXT: kmovd %eax, %k1 7227; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} 7228; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52 7229; AVX512BW-NEXT: kmovd %eax, %k1 7230; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} 7231; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 7232; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 7233; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 7234; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm9 7235; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 7236; AVX512BW-NEXT: kmovd %eax, %k5 7237; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 7238; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 7239; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm5 7240; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} 7241; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 7242; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084 7243; AVX512BW-NEXT: kmovd %eax, %k3 7244; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} 7245; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 7246; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 7247; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 7248; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] 7249; AVX512BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 7250; AVX512BW-NEXT: movl $127, %eax 7251; AVX512BW-NEXT: kmovd %eax, %k4 7252; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} 7253; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 7254; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] 7255; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 7256; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 7257; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 7258; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 7259; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 7260; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} 7261; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 7262; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8 7263; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm7 7264; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} 7265; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 7266; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] 7267; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero 7268; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 7269; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 7270; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] 7271; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 7272; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A 7273; AVX512BW-NEXT: kmovd %eax, %k3 7274; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} 7275; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7276; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 7277; AVX512BW-NEXT: kmovd %eax, %k6 7278; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7279; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} 7280; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 7281; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 7282; AVX512BW-NEXT: 
vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 7283; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm14 7284; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 7285; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} 7286; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7287; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108 7288; AVX512BW-NEXT: kmovd %eax, %k6 7289; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7290; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 7291; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10 7292; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] 7293; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13 7294; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] 7295; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 7296; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7297; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] 7298; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 7299; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 7300; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7301; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 7302; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} 7303; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 7304; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} 7305; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7306; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] 7307; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero 7308; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 7309; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7310; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 7311; AVX512BW-NEXT: kmovd %eax, %k4 7312; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7313; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 7314; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} 7315; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] 7316; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 7317; AVX512BW-NEXT: kmovd %eax, %k6 7318; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} 7319; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} 7320; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7321; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] 7322; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] 7323; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 7324; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 7325; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} 7326; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] 7327; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210 7328; AVX512BW-NEXT: kmovd %eax, %k6 7329; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} 7330; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 7331; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] 7332; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = 
xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7333; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm4 7334; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] 7335; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] 7336; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] 7337; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 7338; AVX512BW-NEXT: vporq %xmm15, %xmm17, %xmm15 7339; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 7340; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 7341; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} 7342; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 7343; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} 7344; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero 7345; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 7346; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] 7347; AVX512BW-NEXT: vporq %xmm17, %xmm15, %xmm15 7348; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 7349; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} 7350; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 7351; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} 7352; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] 7353; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 7354; AVX512BW-NEXT: kmovd %eax, %k5 7355; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} 7356; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} 7357; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] 7358; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 7359; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] 7360; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 7361; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 7362; AVX512BW-NEXT: kmovd %eax, %k5 7363; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 7364; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} 7365; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] 7366; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421 7367; AVX512BW-NEXT: kmovd %eax, %k5 7368; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} 7369; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 7370; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] 7371; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7372; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm9 7373; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] 7374; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 7375; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] 7376; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 7377; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 7378; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7379; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 7380; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF 7381; AVX512BW-NEXT: kmovq %rax, %k5 7382; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} 7383; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 7384; AVX512BW-NEXT: vpblendmw %ymm8, 
%ymm7, %ymm11 {%k3} 7385; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12 7386; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] 7387; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero 7388; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 7389; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 7390; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} 7391; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 7392; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} 7393; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 7394; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 7395; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 7396; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 7397; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 7398; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7399; AVX512BW-NEXT: movl $554172416, %eax # imm = 0x21080000 7400; AVX512BW-NEXT: kmovd %eax, %k2 7401; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 7402; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 7403; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 7404; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} 7405; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] 7406; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842 7407; AVX512BW-NEXT: kmovd %eax, %k2 7408; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} 7409; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 7410; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] 7411; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7412; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 7413; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 7414; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7415; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 7416; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 7417; AVX512BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 7418; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 7419; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} 7420; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7421; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} 7422; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero 7423; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm3 7424; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 7425; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 7426; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 7427; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} 7428; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 7429; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rsi) 7430; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) 7431; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) 7432; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) 7433; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) 7434; AVX512BW-NEXT: vzeroupper 7435; AVX512BW-NEXT: retq 7436; 7437; AVX512BW-FCP-LABEL: load_i8_stride5_vf64: 7438; AVX512BW-FCP: # %bb.0: 7439; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 7440; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 7441; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 7442; 
AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 7443; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 7444; AVX512BW-FCP-NEXT: kmovd %eax, %k2 7445; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} 7446; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 7447; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 7448; AVX512BW-FCP-NEXT: kmovd %eax, %k1 7449; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} 7450; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 7451; AVX512BW-FCP-NEXT: kmovd %eax, %k1 7452; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} 7453; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 7454; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 7455; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 7456; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 7457; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 7458; AVX512BW-FCP-NEXT: kmovd %eax, %k5 7459; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 7460; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 7461; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 7462; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} 7463; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 7464; AVX512BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 7465; AVX512BW-FCP-NEXT: kmovd %eax, %k3 7466; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} 7467; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 7468; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 7469; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 7470; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] 7471; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 7472; AVX512BW-FCP-NEXT: movl $127, %eax 7473; AVX512BW-FCP-NEXT: kmovd %eax, %k4 7474; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} 7475; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 7476; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] 7477; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 7478; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 7479; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 7480; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 7481; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 7482; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} 7483; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 7484; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 7485; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 7486; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} 7487; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 7488; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] 7489; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero 7490; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 7491; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 7492; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] 7493; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 7494; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A 7495; AVX512BW-FCP-NEXT: kmovd %eax, %k3 7496; AVX512BW-FCP-NEXT: vpblendmw %ymm0, 
%ymm1, %ymm10 {%k3} 7497; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7498; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 7499; AVX512BW-FCP-NEXT: kmovd %eax, %k6 7500; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7501; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} 7502; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 7503; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 7504; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 7505; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 7506; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 7507; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} 7508; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7509; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 7510; AVX512BW-FCP-NEXT: kmovd %eax, %k6 7511; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7512; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 7513; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 7514; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] 7515; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 7516; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] 7517; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 7518; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7519; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] 7520; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 7521; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 7522; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7523; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 7524; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} 7525; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 7526; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} 7527; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7528; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] 7529; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero 7530; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 7531; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7532; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 7533; AVX512BW-FCP-NEXT: kmovd %eax, %k4 7534; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7535; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 7536; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} 7537; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] 7538; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 7539; AVX512BW-FCP-NEXT: kmovd %eax, %k6 7540; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} 7541; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} 7542; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7543; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] 7544; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] 7545; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 7546; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 7547; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} 7548; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] 7549; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 7550; AVX512BW-FCP-NEXT: kmovd %eax, %k6 7551; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} 7552; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 7553; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] 7554; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7555; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 7556; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] 7557; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] 7558; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] 7559; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 7560; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 7561; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 7562; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 7563; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} 7564; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 7565; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} 7566; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero 7567; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 7568; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] 7569; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 7570; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 7571; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} 7572; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 7573; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} 7574; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] 7575; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 7576; AVX512BW-FCP-NEXT: kmovd %eax, %k5 7577; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} 7578; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} 7579; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] 7580; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 7581; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] 7582; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 7583; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 7584; AVX512BW-FCP-NEXT: kmovd %eax, %k5 7585; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 7586; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} 7587; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] 7588; AVX512BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421 7589; AVX512BW-FCP-NEXT: kmovd %eax, %k5 7590; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} 7591; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 7592; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] 7593; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = 
xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7594; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 7595; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] 7596; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 7597; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] 7598; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 7599; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 7600; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7601; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 7602; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF 7603; AVX512BW-FCP-NEXT: kmovq %rax, %k5 7604; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} 7605; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 7606; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} 7607; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 7608; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] 7609; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero 7610; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 7611; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 7612; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} 7613; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 7614; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} 7615; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 7616; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 7617; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 7618; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 7619; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 7620; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7621; AVX512BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000 7622; AVX512BW-FCP-NEXT: kmovd %eax, %k2 7623; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 7624; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 7625; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 7626; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} 7627; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] 7628; AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 7629; AVX512BW-FCP-NEXT: kmovd %eax, %k2 7630; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} 7631; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 7632; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] 7633; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7634; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 7635; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 7636; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7637; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 7638; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 7639; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 7640; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 7641; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} 7642; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7643; AVX512BW-FCP-NEXT: 
vmovdqu16 %ymm8, %ymm7 {%k1} 7644; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero 7645; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 7646; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 7647; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 7648; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 7649; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} 7650; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 7651; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) 7652; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 7653; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) 7654; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) 7655; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) 7656; AVX512BW-FCP-NEXT: vzeroupper 7657; AVX512BW-FCP-NEXT: retq 7658; 7659; AVX512DQ-BW-LABEL: load_i8_stride5_vf64: 7660; AVX512DQ-BW: # %bb.0: 7661; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 7662; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 7663; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0 7664; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %ymm1 7665; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294 7666; AVX512DQ-BW-NEXT: kmovd %eax, %k2 7667; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} 7668; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 7669; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 7670; AVX512DQ-BW-NEXT: kmovd %eax, %k1 7671; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} 7672; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52 7673; AVX512DQ-BW-NEXT: kmovd %eax, %k1 7674; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} 7675; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 7676; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 7677; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 7678; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm9 7679; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 7680; AVX512DQ-BW-NEXT: kmovd %eax, %k5 7681; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 7682; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6 7683; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm5 7684; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} 7685; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 7686; AVX512DQ-BW-NEXT: movl $4228, %eax # imm = 0x1084 7687; AVX512DQ-BW-NEXT: kmovd %eax, %k3 7688; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} 7689; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 7690; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 7691; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 7692; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] 7693; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 7694; AVX512DQ-BW-NEXT: movl $127, %eax 7695; AVX512DQ-BW-NEXT: kmovd %eax, %k4 7696; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} 7697; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11 7698; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] 7699; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm12 7700; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 7701; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 7702; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, 
%ymm0, %ymm8 7703; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 7704; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} 7705; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 7706; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm8 7707; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm7 7708; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} 7709; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14 7710; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] 7711; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero 7712; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 7713; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 7714; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] 7715; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 7716; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A 7717; AVX512DQ-BW-NEXT: kmovd %eax, %k3 7718; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} 7719; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7720; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 7721; AVX512DQ-BW-NEXT: kmovd %eax, %k6 7722; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7723; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} 7724; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 7725; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 7726; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 7727; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm14 7728; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 7729; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} 7730; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7731; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108 7732; AVX512DQ-BW-NEXT: kmovd %eax, %k6 7733; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7734; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 7735; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10 7736; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] 7737; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13 7738; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] 7739; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 7740; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7741; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] 7742; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 7743; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 7744; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7745; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 7746; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} 7747; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 7748; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} 7749; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7750; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] 7751; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero 7752; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 7753; 
AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7754; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 7755; AVX512DQ-BW-NEXT: kmovd %eax, %k4 7756; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7757; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 7758; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} 7759; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] 7760; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000 7761; AVX512DQ-BW-NEXT: kmovd %eax, %k6 7762; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} 7763; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} 7764; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7765; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] 7766; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] 7767; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 7768; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 7769; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} 7770; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] 7771; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210 7772; AVX512DQ-BW-NEXT: kmovd %eax, %k6 7773; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} 7774; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 7775; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] 7776; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7777; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm4 7778; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] 7779; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] 7780; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] 7781; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 7782; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm17, %xmm15 7783; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 7784; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 7785; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} 7786; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 7787; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} 7788; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero 7789; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 7790; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] 7791; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm15, %xmm15 7792; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 7793; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} 7794; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 7795; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} 7796; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] 7797; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000 7798; AVX512DQ-BW-NEXT: kmovd %eax, %k5 7799; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} 7800; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} 7801; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] 7802; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 7803; AVX512DQ-BW-NEXT: 
vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] 7804; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 7805; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 7806; AVX512DQ-BW-NEXT: kmovd %eax, %k5 7807; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 7808; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} 7809; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] 7810; AVX512DQ-BW-NEXT: movl $33825, %eax # imm = 0x8421 7811; AVX512DQ-BW-NEXT: kmovd %eax, %k5 7812; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} 7813; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 7814; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] 7815; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7816; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm9 7817; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] 7818; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 7819; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] 7820; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 7821; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 7822; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7823; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 7824; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF 7825; AVX512DQ-BW-NEXT: kmovq %rax, %k5 7826; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} 7827; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 7828; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} 7829; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12 7830; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] 7831; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero 7832; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 7833; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 7834; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} 7835; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 7836; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} 7837; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 7838; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 7839; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 7840; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 7841; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 7842; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7843; AVX512DQ-BW-NEXT: movl $554172416, %eax # imm = 0x21080000 7844; AVX512DQ-BW-NEXT: kmovd %eax, %k2 7845; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 7846; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 7847; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 7848; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} 7849; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] 7850; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842 7851; AVX512DQ-BW-NEXT: kmovd %eax, %k2 7852; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} 7853; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 7854; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] 7855; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7856; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 7857; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 7858; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 7859; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 7860; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 7861; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 7862; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 7863; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} 7864; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7865; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} 7866; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero 7867; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm3 7868; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 7869; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 7870; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 7871; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} 7872; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 7873; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rsi) 7874; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) 7875; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) 7876; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) 7877; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9) 7878; AVX512DQ-BW-NEXT: vzeroupper 7879; AVX512DQ-BW-NEXT: retq 7880; 7881; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf64: 7882; AVX512DQ-BW-FCP: # %bb.0: 7883; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 7884; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 7885; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 7886; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 7887; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 7888; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 7889; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} 7890; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 7891; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 7892; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 7893; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} 7894; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 7895; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 7896; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} 7897; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 7898; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] 7899; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] 7900; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 7901; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 7902; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 7903; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] 7904; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 7905; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 7906; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} 7907; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 7908; AVX512DQ-BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 7909; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 7910; 
AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} 7911; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] 7912; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 7913; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] 7914; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] 7915; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 7916; AVX512DQ-BW-FCP-NEXT: movl $127, %eax 7917; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 7918; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} 7919; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 7920; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] 7921; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 7922; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero 7923; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 7924; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 7925; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 7926; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} 7927; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 7928; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 7929; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 7930; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} 7931; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 7932; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] 7933; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero 7934; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 7935; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 7936; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] 7937; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 7938; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A 7939; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 7940; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} 7941; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7942; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 7943; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 7944; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7945; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} 7946; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] 7947; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 7948; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] 7949; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 7950; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] 7951; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} 7952; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] 7953; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 7954; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 7955; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} 7956; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] 7957; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 7958; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] 7959; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 7960; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] 7961; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 7962; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7963; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] 7964; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero 7965; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 7966; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7967; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 7968; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} 7969; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 7970; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} 7971; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7972; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] 7973; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero 7974; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 7975; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 7976; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 7977; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 7978; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} 7979; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 7980; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} 7981; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] 7982; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 7983; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 7984; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} 7985; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} 7986; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 7987; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] 7988; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] 7989; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 7990; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] 7991; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} 7992; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] 7993; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 7994; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 7995; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} 7996; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] 7997; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] 7998; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 7999; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 8000; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] 8001; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] 8002; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] 8003; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero 8004; 
AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 8005; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 8006; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 8007; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} 8008; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 8009; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} 8010; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero 8011; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 8012; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] 8013; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 8014; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 8015; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} 8016; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 8017; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} 8018; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] 8019; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 8020; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 8021; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} 8022; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} 8023; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] 8024; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 8025; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] 8026; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 8027; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 8028; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 8029; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] 8030; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} 8031; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] 8032; AVX512DQ-BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421 8033; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 8034; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} 8035; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] 8036; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] 8037; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 8038; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 8039; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] 8040; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 8041; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] 8042; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero 8043; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 8044; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 8045; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 8046; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF 8047; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 8048; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} 8049; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 8050; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} 8051; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 8052; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 
= xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] 8053; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero 8054; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 8055; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8056; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} 8057; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 8058; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} 8059; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 8060; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] 8061; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] 8062; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 8063; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 8064; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 8065; AVX512DQ-BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000 8066; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 8067; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 8068; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] 8069; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 8070; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} 8071; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] 8072; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 8073; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 8074; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} 8075; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 8076; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] 8077; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 8078; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 8079; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] 8080; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 8081; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 8082; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] 8083; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 8084; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 8085; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} 8086; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8087; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} 8088; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero 8089; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 8090; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] 8091; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 8092; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8093; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} 8094; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8095; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) 8096; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 8097; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) 8098; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) 8099; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) 8100; AVX512DQ-BW-FCP-NEXT: vzeroupper 8101; AVX512DQ-BW-FCP-NEXT: retq 8102 %wide.vec = load 
<320 x i8>, ptr %in.vec, align 64 8103 %strided.vec0 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315> 8104 %strided.vec1 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316> 8105 %strided.vec2 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317> 8106 %strided.vec3 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318> 8107 %strided.vec4 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319> 8108 store <64 x i8> %strided.vec0, ptr %out.vec0, align 64 8109 store <64 x i8> %strided.vec1, ptr %out.vec1, align 64 8110 store <64 x i8> %strided.vec2, ptr %out.vec2, align 64 8111 store <64 x i8> %strided.vec3, ptr %out.vec3, align 64 8112 store <64 x i8> %strided.vec4, ptr %out.vec4, align 64 8113 ret 
void 8114} 8115