; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
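;
; For reference, a scalar source loop of the kind that lowers to these stride-6
; patterns is sketched below. This is an illustration only; the struct layout
; and function name are hypothetical and not taken from any existing benchmark:
;
;   // Six interleaved i16 fields, de-interleaved into six contiguous outputs.
;   struct S { short f0, f1, f2, f3, f4, f5; };
;   void split(const struct S *in, short *o0, short *o1, short *o2,
;              short *o3, short *o4, short *o5, int n) {
;     for (int i = 0; i < n; i++) {
;       o0[i] = in[i].f0; o1[i] = in[i].f1; o2[i] = in[i].f2;
;       o3[i] = in[i].f3; o4[i] = in[i].f4; o5[i] = in[i].f5;
;     }
;   }
;
; The vectorizer turns each group of iterations into one wide load of the
; interleaved data plus one shufflevector per field, which is the shape of the
; IR bodies in the test functions below.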
17 18define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 19; SSE-LABEL: load_i16_stride6_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movdqa (%rdi), %xmm0 23; SSE-NEXT: movdqa 16(%rdi), %xmm1 24; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 25; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 26; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 27; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 28; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 29; SSE-NEXT: movdqa %xmm0, %xmm5 30; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 31; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 32; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 33; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] 34; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] 35; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 36; SSE-NEXT: psrlq $48, %xmm1 37; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 38; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 39; SSE-NEXT: movd %xmm3, (%rsi) 40; SSE-NEXT: movd %xmm2, (%rdx) 41; SSE-NEXT: movd %xmm4, (%rcx) 42; SSE-NEXT: movd %xmm5, (%r8) 43; SSE-NEXT: movd %xmm7, (%r9) 44; SSE-NEXT: movd %xmm0, (%rax) 45; SSE-NEXT: retq 46; 47; AVX-LABEL: load_i16_stride6_vf2: 48; AVX: # %bb.0: 49; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 50; AVX-NEXT: vmovdqa (%rdi), %xmm0 51; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 52; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 53; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 54; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 55; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 56; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 57; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 58; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 59; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 60; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] 61; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] 62; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 63; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 64; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 65; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 66; AVX-NEXT: vmovd %xmm3, (%rsi) 67; AVX-NEXT: vmovd %xmm2, (%rdx) 68; AVX-NEXT: vmovd %xmm4, (%rcx) 69; AVX-NEXT: vmovd %xmm5, (%r8) 70; AVX-NEXT: vmovd %xmm6, (%r9) 71; AVX-NEXT: vmovd %xmm0, (%rax) 72; AVX-NEXT: retq 73; 74; AVX2-LABEL: load_i16_stride6_vf2: 75; AVX2: # %bb.0: 76; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 77; AVX2-NEXT: vmovdqa (%rdi), %xmm0 78; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 79; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 80; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 81; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 82; AVX2-NEXT: vpbroadcastw 4(%rdi), %xmm4 83; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 84; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 85; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 86; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 87; AVX2-NEXT: vpbroadcastw 20(%rdi), %xmm6 88; AVX2-NEXT: vpbroadcastw 8(%rdi), %xmm7 89; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 90; AVX2-NEXT: vpsrlq $48, %xmm1, %xmm1 91; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 92; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 93; AVX2-NEXT: vmovd %xmm3, (%rsi) 94; AVX2-NEXT: vmovd %xmm2, (%rdx) 95; AVX2-NEXT: vmovd %xmm4, (%rcx) 96; AVX2-NEXT: vmovd %xmm5, (%r8) 97; AVX2-NEXT: vmovd %xmm6, (%r9) 98; AVX2-NEXT: vmovd %xmm0, (%rax) 99; AVX2-NEXT: retq 100; 101; AVX2-FP-LABEL: load_i16_stride6_vf2: 102; AVX2-FP: # %bb.0: 103; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 104; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 105; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 106; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 107; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 108; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 109; AVX2-FP-NEXT: vpbroadcastw 4(%rdi), %xmm4 110; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 111; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 112; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u] 113; AVX2-FP-NEXT: vpbroadcastw 20(%rdi), %xmm6 114; AVX2-FP-NEXT: vpbroadcastw 8(%rdi), %xmm7 115; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 116; AVX2-FP-NEXT: vpsrlq $48, %xmm1, %xmm1 117; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 118; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 119; AVX2-FP-NEXT: vmovd %xmm3, (%rsi) 120; AVX2-FP-NEXT: vmovd %xmm2, (%rdx) 121; AVX2-FP-NEXT: vmovd %xmm4, (%rcx) 122; AVX2-FP-NEXT: vmovd %xmm5, (%r8) 123; AVX2-FP-NEXT: vmovd %xmm6, (%r9) 124; AVX2-FP-NEXT: vmovd %xmm0, (%rax) 125; AVX2-FP-NEXT: retq 126; 127; AVX2-FCP-LABEL: load_i16_stride6_vf2: 128; AVX2-FCP: # %bb.0: 129; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 130; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 131; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 132; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 133; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 134; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 135; AVX2-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 136; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 137; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 138; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u] 139; AVX2-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 140; AVX2-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 141; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 142; AVX2-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 143; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 144; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 145; AVX2-FCP-NEXT: vmovd %xmm3, (%rsi) 146; AVX2-FCP-NEXT: vmovd %xmm2, (%rdx) 147; AVX2-FCP-NEXT: vmovd %xmm4, (%rcx) 148; AVX2-FCP-NEXT: vmovd %xmm5, (%r8) 149; AVX2-FCP-NEXT: vmovd %xmm6, (%r9) 150; AVX2-FCP-NEXT: vmovd %xmm0, (%rax) 151; AVX2-FCP-NEXT: retq 152; 153; AVX512-LABEL: load_i16_stride6_vf2: 154; AVX512: # %bb.0: 155; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 156; AVX512-NEXT: vmovdqa (%rdi), %xmm0 157; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 158; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 159; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 160; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 161; AVX512-NEXT: vpbroadcastw 4(%rdi), %xmm4 162; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 163; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 164; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 165; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 166; AVX512-NEXT: vpbroadcastw 20(%rdi), %xmm6 167; AVX512-NEXT: vpbroadcastw 8(%rdi), %xmm7 168; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 169; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm1 170; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 171; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 172; AVX512-NEXT: vmovd %xmm3, (%rsi) 173; AVX512-NEXT: vmovd %xmm2, (%rdx) 174; AVX512-NEXT: vmovd %xmm4, (%rcx) 175; AVX512-NEXT: vmovd %xmm5, (%r8) 176; AVX512-NEXT: vmovd %xmm6, (%r9) 177; AVX512-NEXT: vmovd %xmm0, (%rax) 178; AVX512-NEXT: retq 179; 180; AVX512-FCP-LABEL: load_i16_stride6_vf2: 181; AVX512-FCP: # %bb.0: 182; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 183; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 184; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 185; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 186; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 187; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 188; AVX512-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 189; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 190; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 191; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u] 192; AVX512-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 193; AVX512-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 194; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 195; AVX512-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 196; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 197; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 198; AVX512-FCP-NEXT: vmovd %xmm3, (%rsi) 199; AVX512-FCP-NEXT: vmovd %xmm2, (%rdx) 200; AVX512-FCP-NEXT: vmovd %xmm4, (%rcx) 201; AVX512-FCP-NEXT: vmovd %xmm5, (%r8) 202; AVX512-FCP-NEXT: vmovd %xmm6, (%r9) 203; AVX512-FCP-NEXT: vmovd %xmm0, (%rax) 204; AVX512-FCP-NEXT: retq 205; 206; AVX512DQ-LABEL: load_i16_stride6_vf2: 207; AVX512DQ: # %bb.0: 208; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 209; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 210; AVX512DQ-NEXT: vmovdqa 
16(%rdi), %xmm1 211; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 212; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 213; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 214; AVX512DQ-NEXT: vpbroadcastw 4(%rdi), %xmm4 215; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 216; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 217; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 218; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 219; AVX512DQ-NEXT: vpbroadcastw 20(%rdi), %xmm6 220; AVX512DQ-NEXT: vpbroadcastw 8(%rdi), %xmm7 221; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 222; AVX512DQ-NEXT: vpsrlq $48, %xmm1, %xmm1 223; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 224; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 225; AVX512DQ-NEXT: vmovd %xmm3, (%rsi) 226; AVX512DQ-NEXT: vmovd %xmm2, (%rdx) 227; AVX512DQ-NEXT: vmovd %xmm4, (%rcx) 228; AVX512DQ-NEXT: vmovd %xmm5, (%r8) 229; AVX512DQ-NEXT: vmovd %xmm6, (%r9) 230; AVX512DQ-NEXT: vmovd %xmm0, (%rax) 231; AVX512DQ-NEXT: retq 232; 233; AVX512DQ-FCP-LABEL: load_i16_stride6_vf2: 234; AVX512DQ-FCP: # %bb.0: 235; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 236; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 237; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 238; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 239; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 240; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 241; AVX512DQ-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 242; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 243; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 244; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u] 245; AVX512DQ-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 246; AVX512DQ-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 247; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 248; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 249; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 250; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 251; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rsi) 252; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rdx) 253; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rcx) 254; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%r8) 255; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r9) 256; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rax) 257; AVX512DQ-FCP-NEXT: retq 258; 259; AVX512BW-LABEL: load_i16_stride6_vf2: 260; AVX512BW: # %bb.0: 261; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 262; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 263; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 264; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 265; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 266; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 267; AVX512BW-NEXT: vpbroadcastw 4(%rdi), %xmm4 268; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 269; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 270; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 271; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 272; AVX512BW-NEXT: vpbroadcastw 20(%rdi), %xmm6 273; AVX512BW-NEXT: vpbroadcastw 8(%rdi), %xmm7 274; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 275; AVX512BW-NEXT: vpsrlq $48, %xmm1, %xmm1 276; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 277; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 278; AVX512BW-NEXT: vmovd %xmm3, (%rsi) 279; AVX512BW-NEXT: vmovd %xmm2, (%rdx) 280; AVX512BW-NEXT: vmovd %xmm4, (%rcx) 281; AVX512BW-NEXT: vmovd %xmm5, (%r8) 282; AVX512BW-NEXT: vmovd %xmm6, (%r9) 283; AVX512BW-NEXT: vmovd %xmm0, (%rax) 284; AVX512BW-NEXT: retq 285; 286; AVX512BW-FCP-LABEL: load_i16_stride6_vf2: 287; AVX512BW-FCP: # %bb.0: 288; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 289; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 290; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 291; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 292; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 293; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 294; AVX512BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 295; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 296; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11] 297; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5 298; AVX512BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 299; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 300; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 301; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 302; AVX512BW-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 303; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 304; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rsi) 305; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rdx) 306; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rcx) 307; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) 308; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) 309; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax) 310; AVX512BW-FCP-NEXT: vzeroupper 311; AVX512BW-FCP-NEXT: retq 312; 313; AVX512DQ-BW-LABEL: load_i16_stride6_vf2: 314; AVX512DQ-BW: # %bb.0: 315; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 316; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 317; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 318; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 319; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 320; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 321; AVX512DQ-BW-NEXT: vpbroadcastw 4(%rdi), %xmm4 322; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 323; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 324; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] 325; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 326; AVX512DQ-BW-NEXT: vpbroadcastw 20(%rdi), %xmm6 327; AVX512DQ-BW-NEXT: vpbroadcastw 8(%rdi), %xmm7 328; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 329; 
AVX512DQ-BW-NEXT: vpsrlq $48, %xmm1, %xmm1 330; AVX512DQ-BW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 331; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 332; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rsi) 333; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rdx) 334; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rcx) 335; AVX512DQ-BW-NEXT: vmovd %xmm5, (%r8) 336; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r9) 337; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax) 338; AVX512DQ-BW-NEXT: retq 339; 340; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf2: 341; AVX512DQ-BW-FCP: # %bb.0: 342; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 343; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 344; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 345; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 346; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] 347; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 348; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 349; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 350; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11] 351; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5 352; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 353; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 354; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 355; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 356; AVX512DQ-BW-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 357; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 358; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rsi) 359; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rdx) 360; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rcx) 361; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) 362; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) 363; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax) 364; AVX512DQ-BW-FCP-NEXT: vzeroupper 365; AVX512DQ-BW-FCP-NEXT: retq 366 %wide.vec = load <12 x i16>, ptr %in.vec, align 64 367 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6> 368 %strided.vec1 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 1, i32 7> 369 %strided.vec2 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 2, i32 8> 370 %strided.vec3 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 3, i32 9> 371 %strided.vec4 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 4, i32 10> 372 %strided.vec5 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11> 373 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64 374 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64 375 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64 376 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64 377 store <2 x i16> %strided.vec4, ptr %out.vec4, align 64 378 store <2 x i16> %strided.vec5, ptr %out.vec5, align 64 379 ret void 380} 381 382define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 383; SSE-LABEL: load_i16_stride6_vf4: 384; SSE: # %bb.0: 385; SSE-NEXT: movdqa (%rdi), %xmm0 386; SSE-NEXT: movdqa 16(%rdi), %xmm1 387; SSE-NEXT: movdqa 32(%rdi), %xmm5 388; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] 389; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] 390; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] 391; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 392; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] 393; SSE-NEXT: movdqa %xmm2, %xmm6 394; SSE-NEXT: pandn %xmm5, %xmm6 395; SSE-NEXT: movdqa %xmm1, %xmm7 396; SSE-NEXT: psrld $16, %xmm7 397; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] 398; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] 399; SSE-NEXT: pand %xmm2, %xmm3 400; SSE-NEXT: por %xmm6, %xmm3 401; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] 402; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] 403; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 404; SSE-NEXT: movdqa %xmm2, %xmm8 405; SSE-NEXT: pandn %xmm5, %xmm8 406; SSE-NEXT: movdqa %xmm0, %xmm5 407; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] 408; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] 409; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] 410; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] 411; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] 412; SSE-NEXT: pand %xmm2, %xmm9 413; SSE-NEXT: por %xmm8, %xmm9 414; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 415; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 416; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 417; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] 418; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 419; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] 420; SSE-NEXT: pand %xmm2, %xmm5 421; SSE-NEXT: pandn %xmm6, %xmm2 422; SSE-NEXT: por %xmm5, %xmm2 423; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] 424; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] 425; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 426; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] 427; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 428; SSE-NEXT: psrlq $48, %xmm1 429; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 430; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 431; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] 432; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 433; SSE-NEXT: movq %xmm4, (%rsi) 434; SSE-NEXT: movq %xmm3, (%rdx) 435; SSE-NEXT: movq %xmm9, (%rcx) 436; SSE-NEXT: movq %xmm2, (%r8) 437; SSE-NEXT: movq %xmm6, (%r9) 438; SSE-NEXT: movq %xmm0, (%rax) 439; SSE-NEXT: retq 440; 441; AVX-LABEL: load_i16_stride6_vf4: 442; AVX: # %bb.0: 443; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 444; AVX-NEXT: vmovdqa (%rdi), %xmm0 445; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 446; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 447; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] 448; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] 449; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] 450; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 451; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 452; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 453; AVX-NEXT: vpsrld $16, %xmm1, %xmm5 454; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] 455; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] 456; AVX-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] 457; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 458; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 459; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] 460; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7] 461; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] 462; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 463; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] 464; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 465; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 466; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 467; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] 468; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 469; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 470; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 471; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 472; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7] 473; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 474; AVX-NEXT: vmovq %xmm4, (%rsi) 475; AVX-NEXT: vmovq %xmm3, (%rdx) 476; AVX-NEXT: vmovq %xmm5, (%rcx) 477; AVX-NEXT: vmovq %xmm6, (%r8) 478; AVX-NEXT: vmovq %xmm7, (%r9) 479; AVX-NEXT: vmovq %xmm0, (%rax) 480; AVX-NEXT: retq 481; 482; AVX2-LABEL: load_i16_stride6_vf4: 483; AVX2: # %bb.0: 484; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 485; AVX2-NEXT: vmovdqa (%rdi), %xmm0 486; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 487; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 488; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 489; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 490; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 491; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4 492; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] 493; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] 494; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 495; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] 496; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] 497; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] 498; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 499; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 500; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] 501; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 502; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 503; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 504; AVX2-NEXT: vmovq %xmm3, (%rsi) 505; AVX2-NEXT: vmovq %xmm4, (%rdx) 506; AVX2-NEXT: vmovq %xmm6, (%rcx) 507; AVX2-NEXT: vmovq %xmm5, (%r8) 508; AVX2-NEXT: vmovq %xmm1, (%r9) 509; AVX2-NEXT: vmovq %xmm0, (%rax) 510; AVX2-NEXT: retq 511; 512; AVX2-FP-LABEL: load_i16_stride6_vf4: 513; AVX2-FP: # %bb.0: 514; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 515; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 516; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 517; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 518; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 519; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 520; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 521; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4 522; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] 523; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 524; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] 525; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] 526; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] 527; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 528; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 529; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] 530; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 531; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 532; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 533; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) 534; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) 535; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) 536; AVX2-FP-NEXT: vmovq %xmm5, (%r8) 537; AVX2-FP-NEXT: vmovq %xmm1, (%r9) 538; AVX2-FP-NEXT: vmovq %xmm0, (%rax) 539; AVX2-FP-NEXT: retq 540; 541; AVX2-FCP-LABEL: load_i16_stride6_vf4: 542; AVX2-FCP: # %bb.0: 543; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 544; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 545; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 546; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 547; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 548; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 549; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 550; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4 551; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] 552; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] 553; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] 554; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] 555; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] 556; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 557; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 558; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] 559; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 560; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 561; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 562; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) 563; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) 564; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) 565; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) 566; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) 567; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) 568; AVX2-FCP-NEXT: retq 569; 570; AVX512-LABEL: load_i16_stride6_vf4: 571; AVX512: # %bb.0: 572; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 573; AVX512-NEXT: vmovdqa (%rdi), %xmm0 574; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 575; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 576; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 577; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 578; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 579; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 580; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 581; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] 582; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 583; 
AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 584; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] 585; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1 586; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 587; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 588; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] 589; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 590; AVX512-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 591; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 592; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 593; AVX512-NEXT: vmovq %xmm3, (%rsi) 594; AVX512-NEXT: vmovq %xmm0, (%rdx) 595; AVX512-NEXT: vmovq %xmm2, (%rcx) 596; AVX512-NEXT: vmovq %xmm1, (%r8) 597; AVX512-NEXT: vmovq %xmm4, (%r9) 598; AVX512-NEXT: vmovq %xmm5, (%rax) 599; AVX512-NEXT: vzeroupper 600; AVX512-NEXT: retq 601; 602; AVX512-FCP-LABEL: load_i16_stride6_vf4: 603; AVX512-FCP: # %bb.0: 604; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 605; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 606; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 607; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 608; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 609; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 610; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 611; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 612; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] 613; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 614; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 615; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] 616; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 617; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 618; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 619; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] 620; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 621; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 622; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 623; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 624; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) 625; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) 626; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) 627; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) 628; AVX512-FCP-NEXT: vmovq %xmm4, (%r9) 629; AVX512-FCP-NEXT: vmovq %xmm5, (%rax) 630; AVX512-FCP-NEXT: vzeroupper 631; AVX512-FCP-NEXT: retq 632; 633; AVX512DQ-LABEL: load_i16_stride6_vf4: 634; AVX512DQ: # %bb.0: 635; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 636; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 637; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 638; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 639; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 640; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 641; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 642; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 643; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 644; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] 645; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 646; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 647; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] 648; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, 
%zmm1 649; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 650; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 651; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] 652; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 653; AVX512DQ-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 654; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 655; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 656; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) 657; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) 658; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) 659; AVX512DQ-NEXT: vmovq %xmm1, (%r8) 660; AVX512DQ-NEXT: vmovq %xmm4, (%r9) 661; AVX512DQ-NEXT: vmovq %xmm5, (%rax) 662; AVX512DQ-NEXT: vzeroupper 663; AVX512DQ-NEXT: retq 664; 665; AVX512DQ-FCP-LABEL: load_i16_stride6_vf4: 666; AVX512DQ-FCP: # %bb.0: 667; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 668; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 669; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 670; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 671; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 672; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 673; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 674; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 675; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] 676; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 677; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 678; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] 679; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 680; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 681; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] 682; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] 683; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 684; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 685; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] 686; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] 687; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) 688; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) 689; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) 690; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) 691; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r9) 692; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rax) 693; AVX512DQ-FCP-NEXT: vzeroupper 694; AVX512DQ-FCP-NEXT: retq 695; 696; AVX512BW-LABEL: load_i16_stride6_vf4: 697; AVX512BW: # %bb.0: 698; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 699; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] 700; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 701; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 702; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] 703; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 704; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] 705; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 706; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] 707; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 708; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] 709; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 710; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] 711; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 712; AVX512BW-NEXT: vmovq %xmm0, (%rsi) 713; AVX512BW-NEXT: vmovq %xmm2, (%rdx) 714; AVX512BW-NEXT: vmovq %xmm3, (%rcx) 715; AVX512BW-NEXT: vmovq %xmm4, (%r8) 716; 
AVX512BW-NEXT: vmovq %xmm5, (%r9) 717; AVX512BW-NEXT: vmovq %xmm1, (%rax) 718; AVX512BW-NEXT: vzeroupper 719; AVX512BW-NEXT: retq 720; 721; AVX512BW-FCP-LABEL: load_i16_stride6_vf4: 722; AVX512BW-FCP: # %bb.0: 723; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 724; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] 725; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 726; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 727; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] 728; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 729; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] 730; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 731; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] 732; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 733; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] 734; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 735; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] 736; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 737; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) 738; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) 739; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) 740; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) 741; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) 742; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) 743; AVX512BW-FCP-NEXT: vzeroupper 744; AVX512BW-FCP-NEXT: retq 745; 746; AVX512DQ-BW-LABEL: load_i16_stride6_vf4: 747; AVX512DQ-BW: # %bb.0: 748; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 749; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] 750; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 751; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 752; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] 753; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 754; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] 755; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 756; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] 757; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 758; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] 759; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 760; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] 761; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 762; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) 763; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) 764; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) 765; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) 766; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) 767; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) 768; AVX512DQ-BW-NEXT: vzeroupper 769; AVX512DQ-BW-NEXT: retq 770; 771; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf4: 772; AVX512DQ-BW-FCP: # %bb.0: 773; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 774; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] 775; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 776; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 777; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] 778; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 779; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] 780; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 781; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] 782; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 783; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] 784; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 785; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] 786; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 787; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) 788; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) 
789; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) 790; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) 791; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) 792; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) 793; AVX512DQ-BW-FCP-NEXT: vzeroupper 794; AVX512DQ-BW-FCP-NEXT: retq 795 %wide.vec = load <24 x i16>, ptr %in.vec, align 64 796 %strided.vec0 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18> 797 %strided.vec1 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19> 798 %strided.vec2 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20> 799 %strided.vec3 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21> 800 %strided.vec4 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22> 801 %strided.vec5 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23> 802 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64 803 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64 804 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64 805 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64 806 store <4 x i16> %strided.vec4, ptr %out.vec4, align 64 807 store <4 x i16> %strided.vec5, ptr %out.vec5, align 64 808 ret void 809} 810 811define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 812; SSE-LABEL: load_i16_stride6_vf8: 813; SSE: # %bb.0: 814; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 815; SSE-NEXT: movdqa 64(%rdi), %xmm1 816; SSE-NEXT: movdqa 80(%rdi), %xmm8 817; SSE-NEXT: movdqa (%rdi), %xmm3 818; SSE-NEXT: movdqa 16(%rdi), %xmm5 819; SSE-NEXT: movdqa 32(%rdi), %xmm6 820; SSE-NEXT: movdqa 48(%rdi), %xmm4 821; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,2,4,5,6,7] 822; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] 823; SSE-NEXT: movdqa %xmm0, %xmm9 824; SSE-NEXT: pandn %xmm2, %xmm9 825; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] 826; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] 827; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] 828; SSE-NEXT: pand %xmm0, %xmm2 829; SSE-NEXT: por %xmm9, %xmm2 830; SSE-NEXT: movdqa %xmm1, %xmm9 831; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] 832; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] 833; SSE-NEXT: movdqa %xmm8, %xmm12 834; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[3,0] 835; SSE-NEXT: movaps %xmm1, %xmm10 836; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] 837; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] 838; SSE-NEXT: pslld $16, %xmm8 839; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 840; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 841; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] 842; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,0,2,4,5,6,7] 843; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3] 844; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] 845; SSE-NEXT: movdqa %xmm5, %xmm9 846; SSE-NEXT: psrld $16, %xmm9 847; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] 848; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] 849; SSE-NEXT: pand %xmm0, %xmm7 850; SSE-NEXT: pandn %xmm6, %xmm0 851; SSE-NEXT: por %xmm7, %xmm0 852; SSE-NEXT: pshuflw {{.*#+}} 
xmm7 = xmm8[0,1,1,3,4,5,6,7] 853; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm7[1,3] 854; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0] 855; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] 856; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] 857; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,0,3] 858; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 859; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0] 860; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] 861; SSE-NEXT: movdqa %xmm11, %xmm8 862; SSE-NEXT: pandn %xmm6, %xmm8 863; SSE-NEXT: movdqa %xmm3, %xmm13 864; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm5[0,0] 865; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,3] 866; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7] 867; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] 868; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[1,0,2,3,4,5,6,7] 869; SSE-NEXT: pand %xmm11, %xmm14 870; SSE-NEXT: por %xmm8, %xmm14 871; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] 872; SSE-NEXT: pand %xmm6, %xmm14 873; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2] 874; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] 875; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 876; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,5,4] 877; SSE-NEXT: movdqa %xmm6, %xmm8 878; SSE-NEXT: pandn %xmm12, %xmm8 879; SSE-NEXT: por %xmm14, %xmm8 880; SSE-NEXT: movdqa %xmm4, %xmm12 881; SSE-NEXT: psrlq $48, %xmm12 882; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm12[0] 883; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm13[3,1,2,3,4,5,6,7] 884; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] 885; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] 886; SSE-NEXT: pand %xmm11, %xmm12 887; SSE-NEXT: pandn %xmm9, %xmm11 888; SSE-NEXT: por %xmm12, %xmm11 889; SSE-NEXT: pand %xmm6, %xmm11 890; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] 891; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,2] 892; SSE-NEXT: movdqa %xmm6, %xmm9 893; SSE-NEXT: pandn %xmm10, %xmm9 894; SSE-NEXT: por %xmm11, %xmm9 895; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] 896; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] 897; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 898; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5,4,6] 899; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] 900; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3] 901; SSE-NEXT: andps %xmm6, %xmm10 902; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7] 903; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] 904; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] 905; SSE-NEXT: movdqa %xmm6, %xmm12 906; SSE-NEXT: pandn %xmm11, %xmm12 907; SSE-NEXT: por %xmm10, %xmm12 908; SSE-NEXT: psrlq $48, %xmm5 909; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 910; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] 911; SSE-NEXT: psrld $16, %xmm4 912; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] 913; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1] 914; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] 915; SSE-NEXT: andps %xmm6, %xmm5 916; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 917; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 918; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] 919; SSE-NEXT: pandn %xmm1, %xmm6 920; SSE-NEXT: por 
%xmm5, %xmm6 921; SSE-NEXT: movaps %xmm2, (%rsi) 922; SSE-NEXT: movaps %xmm0, (%rdx) 923; SSE-NEXT: movdqa %xmm8, (%rcx) 924; SSE-NEXT: movdqa %xmm9, (%r8) 925; SSE-NEXT: movdqa %xmm12, (%r9) 926; SSE-NEXT: movdqa %xmm6, (%rax) 927; SSE-NEXT: retq 928; 929; AVX-LABEL: load_i16_stride6_vf8: 930; AVX: # %bb.0: 931; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 932; AVX-NEXT: vmovdqa (%rdi), %xmm0 933; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 934; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 935; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 936; AVX-NEXT: vpsrlq $16, %xmm4, %xmm3 937; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3] 938; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7] 939; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 940; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] 941; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7] 942; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] 943; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7] 944; AVX-NEXT: vmovdqa 80(%rdi), %xmm5 945; AVX-NEXT: vpslld $16, %xmm5, %xmm9 946; AVX-NEXT: vmovdqa 64(%rdi), %xmm6 947; AVX-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 948; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 949; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] 950; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] 951; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] 952; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] 953; AVX-NEXT: vpsrld $16, %xmm2, %xmm9 954; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 955; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] 956; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] 957; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] 958; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] 959; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] 960; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] 961; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 962; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0] 963; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] 964; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] 965; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7] 966; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7] 967; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 968; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm11[5,6,7] 969; AVX-NEXT: vpsrlq $48, %xmm1, %xmm11 970; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,3,3] 971; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0] 972; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u] 973; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4],xmm9[5,6,7] 974; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 975; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] 976; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] 977; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] 978; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 979; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 980; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,4,6] 981; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm11 = 
xmm11[1],xmm1[1] 982; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] 983; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] 984; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 985; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] 986; AVX-NEXT: vpsrlq $48, %xmm2, %xmm2 987; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 988; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 989; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 990; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] 991; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] 992; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 993; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 994; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 995; AVX-NEXT: vmovdqa %xmm3, (%rsi) 996; AVX-NEXT: vmovdqa %xmm7, (%rdx) 997; AVX-NEXT: vmovdqa %xmm8, (%rcx) 998; AVX-NEXT: vmovdqa %xmm9, (%r8) 999; AVX-NEXT: vmovdqa %xmm6, (%r9) 1000; AVX-NEXT: vmovdqa %xmm0, (%rax) 1001; AVX-NEXT: retq 1002; 1003; AVX2-LABEL: load_i16_stride6_vf8: 1004; AVX2: # %bb.0: 1005; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1006; AVX2-NEXT: vmovdqa (%rdi), %ymm3 1007; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 1008; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 1009; AVX2-NEXT: vpslld $16, %xmm0, %xmm2 1010; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 1011; AVX2-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1012; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 1013; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 1014; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 1015; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 1016; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] 1017; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] 1018; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] 1019; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] 1020; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 1021; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm7 1022; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 1023; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] 1024; AVX2-NEXT: vpbroadcastw 74(%rdi), %xmm6 1025; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] 1026; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1027; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 1028; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1029; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1030; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 1031; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 1032; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7] 1033; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] 1034; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1035; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] 1036; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1037; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1038; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1039; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] 1040; AVX2-NEXT: 
vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 1041; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1042; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1043; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1044; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1045; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 1046; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] 1047; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1048; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1049; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1050; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1051; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] 1052; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] 1053; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] 1054; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 1055; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 1056; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1057; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 1058; AVX2-NEXT: vmovdqa %xmm2, (%rsi) 1059; AVX2-NEXT: vmovdqa %xmm5, (%rdx) 1060; AVX2-NEXT: vmovdqa %xmm8, (%rcx) 1061; AVX2-NEXT: vmovdqa %xmm6, (%r8) 1062; AVX2-NEXT: vmovdqa %xmm1, (%r9) 1063; AVX2-NEXT: vmovdqa %xmm0, (%rax) 1064; AVX2-NEXT: vzeroupper 1065; AVX2-NEXT: retq 1066; 1067; AVX2-FP-LABEL: load_i16_stride6_vf8: 1068; AVX2-FP: # %bb.0: 1069; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1070; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1071; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 1072; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 1073; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 1074; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 1075; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm4 1076; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] 1077; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1078; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] 1079; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3 1080; AVX2-FP-NEXT: vpslld $16, %xmm3, %xmm7 1081; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 1082; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1083; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1084; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] 1085; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 1086; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 1087; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 1088; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] 1089; AVX2-FP-NEXT: vpbroadcastw 74(%rdi), %xmm6 1090; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] 1091; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1092; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 1093; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1094; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1095; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm6 1096; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 1097; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 1098; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1099; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] 1100; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1101; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1102; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1103; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 1104; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1105; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1106; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1107; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 1108; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 1109; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] 1110; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1111; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1112; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] 1113; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1114; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] 1115; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 1116; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 1117; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 1118; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 1119; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1120; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 1121; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi) 1122; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) 1123; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx) 1124; AVX2-FP-NEXT: vmovdqa %xmm6, (%r8) 1125; AVX2-FP-NEXT: vmovdqa %xmm4, (%r9) 1126; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) 1127; AVX2-FP-NEXT: vzeroupper 1128; AVX2-FP-NEXT: retq 1129; 1130; AVX2-FCP-LABEL: load_i16_stride6_vf8: 1131; AVX2-FCP: # %bb.0: 1132; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1133; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1134; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 1135; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 1136; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 1137; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 1138; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 1139; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] 1140; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1141; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] 1142; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 1143; AVX2-FCP-NEXT: vpslld $16, %xmm3, %xmm7 1144; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 1145; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1146; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1147; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] 1148; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 1149; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 1150; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 1151; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] 1152; AVX2-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6 1153; 
AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] 1154; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1155; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 1156; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1157; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1158; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 1159; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 1160; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 1161; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1162; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] 1163; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1164; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1165; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1166; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 1167; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1168; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1169; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1170; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 1171; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 1172; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] 1173; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1174; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1175; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] 1176; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1177; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] 1178; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 1179; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 1180; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 1181; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 1182; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1183; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 1184; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi) 1185; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) 1186; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx) 1187; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r8) 1188; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r9) 1189; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) 1190; AVX2-FCP-NEXT: vzeroupper 1191; AVX2-FCP-NEXT: retq 1192; 1193; AVX512-LABEL: load_i16_stride6_vf8: 1194; AVX512: # %bb.0: 1195; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1196; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0 1197; AVX512-NEXT: vpslld $16, %xmm0, %xmm2 1198; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 1199; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1200; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1201; AVX512-NEXT: vmovdqa (%rdi), %ymm3 1202; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 1203; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 1204; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] 1205; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7 1206; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] 1207; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = 
xmm8[0,1,2,3,4,6,6,7] 1208; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] 1209; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] 1210; AVX512-NEXT: vpbroadcastw 74(%rdi), %xmm6 1211; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] 1212; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] 1213; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] 1214; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] 1215; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 1216; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7] 1217; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] 1218; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] 1219; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1220; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 1221; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1222; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1223; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm6 1224; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 1225; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7] 1226; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] 1227; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1228; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] 1229; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1230; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1231; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1232; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] 1233; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 1234; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1235; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1236; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1237; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1238; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 1239; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] 1240; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1241; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1242; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1243; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1244; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] 1245; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] 1246; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] 1247; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 1248; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 1249; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1250; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 1251; AVX512-NEXT: vmovdqa %xmm2, (%rsi) 1252; AVX512-NEXT: vmovdqa %xmm5, (%rdx) 1253; AVX512-NEXT: vmovdqa %xmm8, (%rcx) 1254; AVX512-NEXT: vmovdqa %xmm6, (%r8) 1255; AVX512-NEXT: vmovdqa %xmm1, (%r9) 1256; AVX512-NEXT: vmovdqa %xmm0, (%rax) 1257; AVX512-NEXT: vzeroupper 1258; AVX512-NEXT: retq 1259; 1260; AVX512-FCP-LABEL: load_i16_stride6_vf8: 1261; AVX512-FCP: # %bb.0: 1262; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1263; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] 1264; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 1265; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 1266; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 1267; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 1268; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 1269; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] 1270; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1271; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] 1272; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 1273; AVX512-FCP-NEXT: vpslld $16, %xmm3, %xmm7 1274; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 1275; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1276; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1277; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] 1278; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7 1279; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 1280; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u] 1281; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] 1282; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] 1283; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] 1284; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] 1285; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 1286; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1287; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1288; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 1289; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 1290; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 1291; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1292; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] 1293; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1294; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1295; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1296; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 1297; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1298; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1299; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1300; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 1301; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 1302; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] 1303; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1304; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1305; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] 1306; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1307; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] 1308; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 1309; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 1310; AVX512-FCP-NEXT: vpshufb 
%xmm7, %xmm1, %xmm1 1311; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] 1312; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1313; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 1314; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) 1315; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rdx) 1316; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rcx) 1317; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r8) 1318; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r9) 1319; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax) 1320; AVX512-FCP-NEXT: vzeroupper 1321; AVX512-FCP-NEXT: retq 1322; 1323; AVX512DQ-LABEL: load_i16_stride6_vf8: 1324; AVX512DQ: # %bb.0: 1325; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1326; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0 1327; AVX512DQ-NEXT: vpslld $16, %xmm0, %xmm2 1328; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 1329; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1330; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1331; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 1332; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 1333; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 1334; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] 1335; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm7 1336; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] 1337; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] 1338; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] 1339; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] 1340; AVX512DQ-NEXT: vpbroadcastw 74(%rdi), %xmm6 1341; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] 1342; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] 1343; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] 1344; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] 1345; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 1346; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7] 1347; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] 1348; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] 1349; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1350; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 1351; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1352; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1353; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm6 1354; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 1355; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7] 1356; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] 1357; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1358; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] 1359; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1360; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1361; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1362; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] 1363; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 1364; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1365; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1366; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1367; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 1368; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 1369; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] 1370; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1371; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1372; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1373; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1374; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] 1375; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] 1376; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] 1377; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 1378; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 1379; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1380; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 1381; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) 1382; AVX512DQ-NEXT: vmovdqa %xmm5, (%rdx) 1383; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx) 1384; AVX512DQ-NEXT: vmovdqa %xmm6, (%r8) 1385; AVX512DQ-NEXT: vmovdqa %xmm1, (%r9) 1386; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) 1387; AVX512DQ-NEXT: vzeroupper 1388; AVX512DQ-NEXT: retq 1389; 1390; AVX512DQ-FCP-LABEL: load_i16_stride6_vf8: 1391; AVX512DQ-FCP: # %bb.0: 1392; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1393; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] 1394; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 1395; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 1396; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 1397; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 1398; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 1399; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] 1400; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1401; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] 1402; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 1403; AVX512DQ-FCP-NEXT: vpslld $16, %xmm3, %xmm7 1404; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 1405; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1406; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1407; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] 1408; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7 1409; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 1410; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u] 1411; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] 1412; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] 1413; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] 1414; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] 1415; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 1416; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 1417; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] 1418; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 1419; AVX512DQ-FCP-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm6[0,3,2,1] 1420; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 1421; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] 1422; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] 1423; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] 1424; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] 1425; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 1426; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 1427; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 1428; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] 1429; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] 1430; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 1431; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 1432; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] 1433; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] 1434; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] 1435; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] 1436; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] 1437; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] 1438; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 1439; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 1440; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 1441; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] 1442; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] 1443; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 1444; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) 1445; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rdx) 1446; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rcx) 1447; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r8) 1448; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r9) 1449; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax) 1450; AVX512DQ-FCP-NEXT: vzeroupper 1451; AVX512DQ-FCP-NEXT: retq 1452; 1453; AVX512BW-LABEL: load_i16_stride6_vf8: 1454; AVX512BW: # %bb.0: 1455; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1456; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1457; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1458; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] 1459; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1460; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] 1461; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1462; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] 1463; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 1464; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] 1465; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 1466; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] 1467; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 1468; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] 1469; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 1470; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) 1471; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) 1472; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) 1473; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) 1474; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) 1475; AVX512BW-NEXT: vmovdqa %xmm7, (%rax) 1476; 
AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride6_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride6_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <48 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
  %strided.vec1 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
  %strided.vec2 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
  %strided.vec3 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
  %strided.vec4 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
  %strided.vec5 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
  store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i16_stride6_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $136, %rsp
; SSE-NEXT: movdqa 112(%rdi), %xmm9
; SSE-NEXT: movdqa 128(%rdi), %xmm7
; SSE-NEXT: movdqa 64(%rdi), %xmm2
; SSE-NEXT: movdqa 80(%rdi), %xmm11
; SSE-NEXT: movdqa (%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 16(%rdi), %xmm6
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm8
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
; SSE-NEXT: movdqa %xmm10, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE-NEXT: pand %xmm10, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm13
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,3,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE-NEXT: movdqa %xmm11, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
1602; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1603; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1604; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0] 1605; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3] 1606; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1607; SSE-NEXT: pslld $16, %xmm11 1608; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1609; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] 1610; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] 1611; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] 1612; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] 1613; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] 1614; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1615; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] 1616; SSE-NEXT: movdqa %xmm10, %xmm3 1617; SSE-NEXT: pandn %xmm0, %xmm3 1618; SSE-NEXT: movdqa 96(%rdi), %xmm0 1619; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1620; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] 1621; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] 1622; SSE-NEXT: movdqa %xmm9, %xmm11 1623; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1624; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] 1625; SSE-NEXT: pand %xmm10, %xmm2 1626; SSE-NEXT: por %xmm3, %xmm2 1627; SSE-NEXT: movdqa 160(%rdi), %xmm14 1628; SSE-NEXT: movdqa 176(%rdi), %xmm3 1629; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] 1630; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 1631; SSE-NEXT: movdqa %xmm3, %xmm1 1632; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] 1633; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1634; SSE-NEXT: movdqa %xmm14, %xmm13 1635; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1636; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[0,0] 1637; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm3[2,3] 1638; SSE-NEXT: pslld $16, %xmm3 1639; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1640; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] 1641; SSE-NEXT: movdqa 144(%rdi), %xmm1 1642; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1643; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 1644; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] 1645; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm9[1,3] 1646; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] 1647; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1648; SSE-NEXT: movdqa %xmm6, %xmm13 1649; SSE-NEXT: psrld $16, %xmm13 1650; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 1651; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] 1652; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] 1653; SSE-NEXT: movdqa %xmm10, %xmm13 1654; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 1655; SSE-NEXT: pandn %xmm15, %xmm13 1656; SSE-NEXT: pand %xmm10, %xmm9 1657; SSE-NEXT: por %xmm13, %xmm9 1658; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] 1659; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] 1660; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0] 1661; SSE-NEXT: movdqa %xmm11, %xmm4 1662; SSE-NEXT: psrld $16, %xmm4 1663; SSE-NEXT: pshufhw 
{{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] 1664; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1665; SSE-NEXT: pand %xmm10, %xmm2 1666; SSE-NEXT: movdqa %xmm7, %xmm5 1667; SSE-NEXT: pandn %xmm7, %xmm10 1668; SSE-NEXT: por %xmm2, %xmm10 1669; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1670; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] 1671; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] 1672; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1673; SSE-NEXT: movdqa %xmm15, %xmm1 1674; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1675; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 1676; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 1677; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] 1678; SSE-NEXT: movdqa %xmm2, %xmm4 1679; SSE-NEXT: pandn %xmm1, %xmm4 1680; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 1681; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1682; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] 1683; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] 1684; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] 1685; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 1686; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] 1687; SSE-NEXT: pand %xmm2, %xmm1 1688; SSE-NEXT: por %xmm4, %xmm1 1689; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1690; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1691; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 1692; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1693; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] 1694; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] 1695; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] 1696; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] 1697; SSE-NEXT: movdqa %xmm12, %xmm0 1698; SSE-NEXT: pandn %xmm4, %xmm0 1699; SSE-NEXT: pand %xmm12, %xmm1 1700; SSE-NEXT: por %xmm1, %xmm0 1701; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1702; SSE-NEXT: movdqa %xmm7, %xmm1 1703; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1704; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1705; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 1706; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] 1707; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] 1708; SSE-NEXT: movdqa %xmm2, %xmm4 1709; SSE-NEXT: pandn %xmm1, %xmm4 1710; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 1711; SSE-NEXT: movaps %xmm10, %xmm13 1712; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 1713; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] 1714; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] 1715; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] 1716; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 1717; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,2,3,4,5,6,7] 1718; SSE-NEXT: pand %xmm2, %xmm0 1719; SSE-NEXT: por %xmm4, %xmm0 1720; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1721; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 1722; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2] 1723; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] 1724; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 1725; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,5,4] 1726; SSE-NEXT: movdqa %xmm12, 
%xmm1 1727; SSE-NEXT: pandn %xmm4, %xmm1 1728; SSE-NEXT: pand %xmm12, %xmm0 1729; SSE-NEXT: por %xmm0, %xmm1 1730; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1731; SSE-NEXT: movdqa %xmm8, %xmm1 1732; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill 1733; SSE-NEXT: movdqa %xmm8, %xmm0 1734; SSE-NEXT: psrlq $48, %xmm0 1735; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1736; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] 1737; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] 1738; SSE-NEXT: movdqa %xmm2, %xmm0 1739; SSE-NEXT: pandn %xmm4, %xmm0 1740; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 1741; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 1742; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] 1743; SSE-NEXT: pand %xmm2, %xmm3 1744; SSE-NEXT: por %xmm0, %xmm3 1745; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1746; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,7] 1747; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 1748; SSE-NEXT: movdqa %xmm12, %xmm11 1749; SSE-NEXT: pandn %xmm0, %xmm11 1750; SSE-NEXT: pand %xmm12, %xmm3 1751; SSE-NEXT: por %xmm3, %xmm11 1752; SSE-NEXT: movdqa %xmm7, %xmm4 1753; SSE-NEXT: movdqa %xmm7, %xmm0 1754; SSE-NEXT: psrlq $48, %xmm0 1755; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] 1756; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] 1757; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] 1758; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 1759; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 1760; SSE-NEXT: pand %xmm2, %xmm0 1761; SSE-NEXT: pandn %xmm3, %xmm2 1762; SSE-NEXT: por %xmm0, %xmm2 1763; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,7] 1764; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 1765; SSE-NEXT: movdqa %xmm12, %xmm7 1766; SSE-NEXT: pandn %xmm0, %xmm7 1767; SSE-NEXT: pand %xmm12, %xmm2 1768; SSE-NEXT: por %xmm2, %xmm7 1769; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1770; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 1771; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1772; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] 1773; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 1774; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,1,0,3] 1775; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,4,6] 1776; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] 1777; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] 1778; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1779; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] 1780; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 1781; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] 1782; SSE-NEXT: movdqa %xmm12, %xmm1 1783; SSE-NEXT: pandn %xmm2, %xmm1 1784; SSE-NEXT: andps %xmm12, %xmm3 1785; SSE-NEXT: por %xmm3, %xmm1 1786; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] 1787; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] 1788; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1789; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1790; SSE-NEXT: # xmm0 = mem[0,1,0,3] 1791; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1792; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] 1793; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] 1794; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] 1795; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] 1796; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 1797; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 1798; SSE-NEXT: movdqa %xmm12, %xmm4 1799; SSE-NEXT: pandn %xmm3, %xmm4 1800; SSE-NEXT: andps %xmm12, %xmm2 1801; SSE-NEXT: por %xmm2, %xmm4 1802; SSE-NEXT: psrlq $48, %xmm5 1803; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1804; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 1805; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload 1806; SSE-NEXT: psrld $16, %xmm0 1807; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,5,7] 1808; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] 1809; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] 1810; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] 1811; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 1812; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] 1813; SSE-NEXT: movdqa %xmm12, %xmm10 1814; SSE-NEXT: pandn %xmm3, %xmm10 1815; SSE-NEXT: andps %xmm12, %xmm2 1816; SSE-NEXT: por %xmm2, %xmm10 1817; SSE-NEXT: psrlq $48, %xmm15 1818; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1819; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1820; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] 1821; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 1822; SSE-NEXT: psrld $16, %xmm3 1823; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 1824; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] 1825; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] 1826; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 1827; SSE-NEXT: andps %xmm12, %xmm2 1828; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] 1829; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 1830; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] 1831; SSE-NEXT: pandn %xmm3, %xmm12 1832; SSE-NEXT: por %xmm2, %xmm12 1833; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1834; SSE-NEXT: movaps %xmm0, 16(%rsi) 1835; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1836; SSE-NEXT: movaps %xmm0, (%rsi) 1837; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1838; SSE-NEXT: movaps %xmm0, 16(%rdx) 1839; SSE-NEXT: movaps %xmm9, (%rdx) 1840; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1841; SSE-NEXT: movaps %xmm0, 16(%rcx) 1842; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1843; SSE-NEXT: movaps %xmm0, (%rcx) 1844; SSE-NEXT: movdqa %xmm7, 16(%r8) 1845; SSE-NEXT: movdqa %xmm11, (%r8) 1846; SSE-NEXT: movdqa %xmm4, 16(%r9) 1847; SSE-NEXT: movdqa %xmm1, (%r9) 1848; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1849; SSE-NEXT: movdqa %xmm12, 16(%rax) 1850; SSE-NEXT: movdqa %xmm10, (%rax) 1851; SSE-NEXT: addq $136, %rsp 1852; SSE-NEXT: retq 1853; 1854; AVX-LABEL: load_i16_stride6_vf16: 1855; AVX: # %bb.0: 1856; AVX-NEXT: subq $88, %rsp 1857; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 1858; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1859; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] 1860; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] 1861; AVX-NEXT: vmovdqa 112(%rdi), %xmm8 1862; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] 1863; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 1864; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1865; AVX-NEXT: vpslld $16, 
%xmm1, %xmm2 1866; AVX-NEXT: vmovdqa 64(%rdi), %xmm4 1867; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1868; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1869; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm5 1870; AVX-NEXT: vmovdqa (%rdi), %xmm0 1871; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1872; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 1873; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1874; AVX-NEXT: vmovdqa 32(%rdi), %xmm7 1875; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 1876; AVX-NEXT: vpsrlq $16, %xmm7, %xmm10 1877; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,3,2,3] 1878; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7] 1879; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 1880; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] 1881; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] 1882; AVX-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] 1883; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7] 1884; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm5[3,4,5,6,7] 1885; AVX-NEXT: vmovdqa 176(%rdi), %xmm10 1886; AVX-NEXT: vpslld $16, %xmm10, %xmm5 1887; AVX-NEXT: vmovdqa 160(%rdi), %xmm11 1888; AVX-NEXT: vpsrldq {{.*#+}} xmm12 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1889; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] 1890; AVX-NEXT: vmovdqa 128(%rdi), %xmm12 1891; AVX-NEXT: vpsrlq $16, %xmm12, %xmm0 1892; AVX-NEXT: vmovdqa 144(%rdi), %xmm1 1893; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1894; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 1895; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] 1896; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 1897; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] 1898; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 1899; AVX-NEXT: vandps %ymm2, %ymm14, %ymm2 1900; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1901; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0 1902; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 1903; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1904; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] 1905; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1906; AVX-NEXT: vpsrld $16, %xmm8, %xmm2 1907; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1908; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1909; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] 1910; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 1911; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] 1912; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1913; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] 1914; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] 1915; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1916; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] 1917; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1918; AVX-NEXT: vpsrld $16, %xmm5, %xmm9 1919; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] 1920; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] 1921; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2],ymm0[3,4,5,6,7] 1922; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1923; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] 1924; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1925; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] 1926; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] 1927; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] 1928; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0 1929; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 1930; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1 1931; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 1932; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1933; AVX-NEXT: vmovdqa %xmm6, %xmm2 1934; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 1935; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1936; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1937; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] 1938; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload 1939; AVX-NEXT: # xmm6 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] 1940; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm3 1941; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] 1942; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1943; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] 1944; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 1945; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] 1946; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7] 1947; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm13 1948; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 1949; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 1950; AVX-NEXT: vandnps %ymm0, %ymm13, %ymm0 1951; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1 1952; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 1953; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 1954; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] 1955; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1956; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] 1957; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] 1958; AVX-NEXT: vpshufb %xmm9, %xmm8, %xmm9 1959; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] 1960; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0 1961; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 1962; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1 1963; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 1964; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1965; AVX-NEXT: vmovdqa %xmm2, %xmm0 1966; AVX-NEXT: vpsrlq $48, %xmm2, %xmm1 1967; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,2,3,3] 1968; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0] 1969; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] 1970; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm2 1971; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 1972; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm2 1973; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] 1974; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm9 1975; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 1976; AVX-NEXT: vandnps %ymm1, %ymm13, %ymm1 1977; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 1978; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 1979; AVX-NEXT: vpsrlq $48, %xmm4, %xmm2 1980; AVX-NEXT: vpshufd {{.*#+}} 
xmm9 = xmm12[2,2,3,3] 1981; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0] 1982; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 1983; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 1984; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1 1985; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 1986; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2 1987; AVX-NEXT: vorps %ymm2, %ymm1, %ymm14 1988; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1989; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] 1990; AVX-NEXT: vmovdqa %xmm5, %xmm13 1991; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] 1992; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1993; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1994; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 1995; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] 1996; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] 1997; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm8 1998; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 1999; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2000; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] 2001; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 2002; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] 2003; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 2004; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] 2005; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5,4,6] 2006; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1] 2007; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] 2008; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 2009; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm1 2010; AVX-NEXT: vandps %ymm9, %ymm8, %ymm8 2011; AVX-NEXT: vorps %ymm1, %ymm8, %ymm1 2012; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] 2013; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 2014; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] 2015; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6] 2016; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2017; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm5[1] 2018; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7] 2019; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2020; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] 2021; AVX-NEXT: vpsrlq $48, %xmm6, %xmm3 2022; AVX-NEXT: vpsrldq {{.*#+}} xmm11 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2023; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] 2024; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] 2025; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 2026; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2027; AVX-NEXT: vpsrlq $48, %xmm4, %xmm3 2028; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2029; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 2030; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 2031; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] 2032; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] 2033; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] 2034; AVX-NEXT: vandnps %ymm2, %ymm9, %ymm2 2035; AVX-NEXT: vandps %ymm3, %ymm9, 
%ymm3 2036; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 2037; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm3 2038; AVX-NEXT: vpsrld $16, %xmm5, %xmm4 2039; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7] 2040; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] 2041; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] 2042; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 2043; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 2044; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 2045; AVX-NEXT: vmovaps %ymm3, (%rsi) 2046; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2047; AVX-NEXT: vmovaps %ymm0, (%rdx) 2048; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2049; AVX-NEXT: vmovaps %ymm0, (%rcx) 2050; AVX-NEXT: vmovaps %ymm14, (%r8) 2051; AVX-NEXT: vmovaps %ymm1, (%r9) 2052; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2053; AVX-NEXT: vmovaps %ymm2, (%rax) 2054; AVX-NEXT: addq $88, %rsp 2055; AVX-NEXT: vzeroupper 2056; AVX-NEXT: retq 2057; 2058; AVX2-LABEL: load_i16_stride6_vf16: 2059; AVX2: # %bb.0: 2060; AVX2-NEXT: vmovdqa (%rdi), %ymm4 2061; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 2062; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 2063; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 2064; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 2065; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2 2066; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 2067; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,2,2,2,4,5,6,7] 2068; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 2069; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7] 2070; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] 2071; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 2072; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] 2073; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 2074; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 2075; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] 2076; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 2077; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] 2078; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] 2079; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] 2080; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2081; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2082; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] 2083; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 2084; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 2085; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[2,1,0,3] 2086; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 2087; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2088; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] 2089; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2090; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[1,1,1,1,4,5,6,7] 2091; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7] 2092; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] 2093; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7] 2094; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7] 2095; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] 2096; AVX2-NEXT: vpshufb 
{{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2097; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] 2098; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] 2099; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] 2100; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 2101; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 2102; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 2103; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] 2104; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] 2105; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7] 2106; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7] 2107; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] 2108; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2109; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2110; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] 2111; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] 2112; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] 2113; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm12 2114; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] 2115; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7] 2116; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] 2117; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2118; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 2119; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] 2120; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] 2121; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] 2122; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 2123; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] 2124; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7] 2125; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7] 2126; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] 2127; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] 2128; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] 2129; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] 2130; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] 2131; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 2132; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 2133; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] 2134; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] 2135; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] 2136; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 2137; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] 2138; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 2139; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] 2140; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2141; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] 2142; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2143; AVX2-NEXT: vpshufb 
{{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 2144; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] 2145; AVX2-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 2146; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 2147; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 2148; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] 2149; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 2150; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] 2151; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 2152; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 2153; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 2154; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] 2155; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 2156; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] 2157; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] 2158; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] 2159; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 2160; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] 2161; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] 2162; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 2163; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 2164; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] 2165; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2166; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 2167; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 2168; AVX2-NEXT: vmovdqa %ymm3, (%rdx) 2169; AVX2-NEXT: vmovdqa %ymm8, (%rcx) 2170; AVX2-NEXT: vmovdqa %ymm9, (%r8) 2171; AVX2-NEXT: vmovdqa %ymm5, (%r9) 2172; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2173; AVX2-NEXT: vmovdqa %ymm1, (%rax) 2174; AVX2-NEXT: vzeroupper 2175; AVX2-NEXT: retq 2176; 2177; AVX2-FP-LABEL: load_i16_stride6_vf16: 2178; AVX2-FP: # %bb.0: 2179; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 2180; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 2181; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 2182; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 2183; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 2184; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 2185; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 2186; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 2187; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm7 2188; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 2189; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 2190; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] 2191; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 2192; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] 2193; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 2194; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 2195; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 2196; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] 2197; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 2198; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7] 2199; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] 2200; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] 2201; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2202; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2203; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] 2204; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 2205; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 2206; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2207; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 2208; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2209; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] 2210; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2211; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 2212; AVX2-FP-NEXT: vpshufb %xmm9, %xmm11, %xmm10 2213; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7] 2214; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7] 2215; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] 2216; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2217; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] 2218; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] 2219; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 2220; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 2221; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 2222; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] 2223; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4] 2224; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3] 2225; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 2226; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7] 2227; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2228; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] 2229; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3] 2230; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] 2231; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 2232; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] 2233; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 2234; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2235; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2236; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 2237; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] 2238; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 2239; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7] 2240; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] 2241; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5] 2242; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 2243; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7] 2244; AVX2-FP-NEXT: vpshufb %ymm9, %ymm15, %ymm9 
2245; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7] 2246; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 2247; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] 2248; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 2249; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] 2250; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] 2251; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] 2252; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] 2253; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] 2254; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 2255; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] 2256; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2257; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7] 2258; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2259; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 2260; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] 2261; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 2262; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 2263; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 2264; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 2265; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2266; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] 2267; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 2268; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 2269; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 2270; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 2271; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm6 2272; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 2273; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 2274; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] 2275; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 2276; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] 2277; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 2278; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 2279; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 2280; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] 2281; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2282; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2283; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2284; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) 2285; AVX2-FP-NEXT: vmovdqa %ymm4, (%rdx) 2286; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx) 2287; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) 2288; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) 2289; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2290; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) 2291; AVX2-FP-NEXT: vzeroupper 2292; AVX2-FP-NEXT: retq 2293; 2294; AVX2-FCP-LABEL: load_i16_stride6_vf16: 2295; AVX2-FCP: # %bb.0: 2296; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 2297; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 2298; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 2299; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 2300; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 2301; 
AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 2302; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 2303; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 2304; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm7 2305; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 2306; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 2307; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] 2308; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 2309; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] 2310; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 2311; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 2312; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 2313; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] 2314; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 2315; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7] 2316; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] 2317; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] 2318; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2319; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2320; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] 2321; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 2322; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 2323; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2324; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 2325; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2326; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] 2327; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2328; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 2329; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm10 2330; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7] 2331; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7] 2332; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] 2333; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2334; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] 2335; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] 2336; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 2337; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 2338; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10 2339; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] 2340; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4] 2341; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3] 2342; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 2343; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7] 2344; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2345; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] 2346; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = 
xmm11[2,1,2,3] 2347; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] 2348; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 2349; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] 2350; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 2351; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2352; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2353; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 2354; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] 2355; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 2356; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7] 2357; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] 2358; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5] 2359; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 2360; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7] 2361; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm9 2362; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7] 2363; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 2364; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] 2365; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 2366; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] 2367; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] 2368; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] 2369; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] 2370; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] 2371; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 2372; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] 2373; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2374; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7] 2375; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2376; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 2377; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] 2378; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 2379; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 2380; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 2381; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 2382; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2383; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] 2384; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 2385; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 2386; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 2387; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 2388; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm6 2389; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 2390; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 2391; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] 2392; 
AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 2393; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] 2394; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 2395; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 2396; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 2397; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] 2398; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2399; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2400; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2401; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) 2402; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rdx) 2403; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx) 2404; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) 2405; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) 2406; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2407; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) 2408; AVX2-FCP-NEXT: vzeroupper 2409; AVX2-FCP-NEXT: retq 2410; 2411; AVX512-LABEL: load_i16_stride6_vf16: 2412; AVX512: # %bb.0: 2413; AVX512-NEXT: vmovdqa 160(%rdi), %ymm0 2414; AVX512-NEXT: vmovdqa (%rdi), %ymm3 2415; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 2416; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 2417; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 2418; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 2419; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] 2420; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm8 2421; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] 2422; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] 2423; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 2424; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] 2425; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm7 2426; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2427; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2428; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 2429; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 2430; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm13 2431; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] 2432; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] 2433; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] 2434; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] 2435; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] 2436; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] 2437; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 2438; AVX512-NEXT: vpshufb %xmm9, %xmm13, %xmm12 2439; AVX512-NEXT: vpshufb %xmm9, %xmm11, %xmm9 2440; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] 2441; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2442; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2443; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 2444; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2445; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] 2446; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 2447; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] 2448; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] 2449; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 2450; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 2451; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 2452; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] 2453; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] 2454; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7] 2455; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7] 2456; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] 2457; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2458; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2459; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 2460; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] 2461; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] 2462; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 2463; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] 2464; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7] 2465; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] 2466; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2467; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 2468; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] 2469; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] 2470; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 2471; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] 2472; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7] 2473; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7] 2474; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] 2475; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] 2476; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] 2477; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] 2478; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] 2479; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 2480; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 2481; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] 2482; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] 2483; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 2484; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2485; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2486; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 2487; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 2488; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] 2489; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2490; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] 2491; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = 
[65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 2492; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7 2493; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] 2494; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 2495; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] 2496; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2497; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7] 2498; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6] 2499; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] 2500; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 2501; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] 2502; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 2503; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] 2504; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 2505; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 2506; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2507; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4 2508; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] 2509; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] 2510; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 2511; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7] 2512; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 2513; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 2514; AVX512-NEXT: vmovdqa %ymm1, (%rsi) 2515; AVX512-NEXT: vmovdqa %ymm5, (%rdx) 2516; AVX512-NEXT: vmovdqa %ymm8, (%rcx) 2517; AVX512-NEXT: vmovdqa %ymm9, (%r8) 2518; AVX512-NEXT: vmovdqa %ymm7, (%r9) 2519; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 2520; AVX512-NEXT: vmovdqa %ymm0, (%rax) 2521; AVX512-NEXT: vzeroupper 2522; AVX512-NEXT: retq 2523; 2524; AVX512-FCP-LABEL: load_i16_stride6_vf16: 2525; AVX512-FCP: # %bb.0: 2526; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 2527; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 2528; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 2529; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 2530; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 2531; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] 2532; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm5 2533; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 2534; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3] 2535; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 2536; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] 2537; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3] 2538; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm7 2539; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2540; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2541; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] 2542; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 2543; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] 2544; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 2545; AVX512-FCP-NEXT: 
vpshufb %xmm11, %xmm10, %xmm12 2546; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 2547; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm11 2548; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] 2549; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 2550; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15] 2551; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] 2552; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 2553; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 2554; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 2555; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] 2556; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2557; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7] 2558; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5] 2559; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2560; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] 2561; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2562; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] 2563; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 2564; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] 2565; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 2566; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 2567; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] 2568; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] 2569; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 2570; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] 2571; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2572; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2573; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 2574; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] 2575; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] 2576; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 2577; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] 2578; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 2579; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2580; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 2581; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] 2582; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] 2583; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 2584; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] 2585; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 2586; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] 2587; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] 2588; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 2589; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] 2590; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 2591; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 2592; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] 2593; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] 2594; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 2595; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2596; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2597; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 2598; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 2599; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] 2600; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2601; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] 2602; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 2603; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7 2604; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] 2605; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 2606; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 2607; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm12 2608; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2609; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 2610; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7] 2611; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 2612; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] 2613; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 2614; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 2615; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 2616; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 2617; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2618; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11) 2619; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 2620; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 2621; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2622; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] 2623; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 2624; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 2625; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rsi) 2626; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx) 2627; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rcx) 2628; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r8) 2629; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r9) 2630; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2631; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 2632; AVX512-FCP-NEXT: vzeroupper 2633; AVX512-FCP-NEXT: retq 2634; 2635; AVX512DQ-LABEL: load_i16_stride6_vf16: 2636; AVX512DQ: # %bb.0: 2637; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm0 2638; AVX512DQ-NEXT: vmovdqa 
(%rdi), %ymm3 2639; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 2640; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 2641; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 2642; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 2643; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] 2644; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm8 2645; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] 2646; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] 2647; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 2648; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] 2649; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm7 2650; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2651; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2652; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 2653; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 2654; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm13 2655; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] 2656; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] 2657; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] 2658; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] 2659; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] 2660; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] 2661; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 2662; AVX512DQ-NEXT: vpshufb %xmm9, %xmm13, %xmm12 2663; AVX512DQ-NEXT: vpshufb %xmm9, %xmm11, %xmm9 2664; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] 2665; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2666; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2667; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 2668; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2669; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] 2670; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 2671; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] 2672; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] 2673; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 2674; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 2675; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 2676; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] 2677; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] 2678; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7] 2679; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7] 2680; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] 2681; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2682; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2683; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 2684; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] 2685; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = 
xmm13[2,1,2,0,4,5,6,7] 2686; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 2687; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] 2688; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7] 2689; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] 2690; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2691; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 2692; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] 2693; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] 2694; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 2695; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] 2696; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7] 2697; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7] 2698; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] 2699; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] 2700; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] 2701; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] 2702; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] 2703; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 2704; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 2705; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] 2706; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] 2707; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 2708; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2709; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2710; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 2711; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 2712; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] 2713; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2714; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] 2715; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 2716; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7 2717; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] 2718; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 2719; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] 2720; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2721; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7] 2722; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6] 2723; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] 2724; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 2725; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] 2726; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 2727; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] 2728; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,5,5,5,5] 2729; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 2730; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2731; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4 2732; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] 2733; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] 2734; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 2735; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7] 2736; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 2737; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 2738; AVX512DQ-NEXT: vmovdqa %ymm1, (%rsi) 2739; AVX512DQ-NEXT: vmovdqa %ymm5, (%rdx) 2740; AVX512DQ-NEXT: vmovdqa %ymm8, (%rcx) 2741; AVX512DQ-NEXT: vmovdqa %ymm9, (%r8) 2742; AVX512DQ-NEXT: vmovdqa %ymm7, (%r9) 2743; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 2744; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 2745; AVX512DQ-NEXT: vzeroupper 2746; AVX512DQ-NEXT: retq 2747; 2748; AVX512DQ-FCP-LABEL: load_i16_stride6_vf16: 2749; AVX512DQ-FCP: # %bb.0: 2750; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 2751; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 2752; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 2753; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 2754; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 2755; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] 2756; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm5 2757; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 2758; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3] 2759; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 2760; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] 2761; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3] 2762; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm7 2763; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] 2764; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 2765; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] 2766; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 2767; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] 2768; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 2769; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 2770; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 2771; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm11 2772; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] 2773; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 2774; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15] 2775; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] 2776; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 2777; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 2778; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 2779; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] 2780; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 2781; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7] 2782; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,5,5,5] 2783; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 2784; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] 2785; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2786; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] 2787; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 2788; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] 2789; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 2790; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 2791; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] 2792; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] 2793; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 2794; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] 2795; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2796; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] 2797; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 2798; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] 2799; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] 2800; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 2801; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] 2802; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 2803; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] 2804; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 2805; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] 2806; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] 2807; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 2808; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] 2809; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 2810; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] 2811; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] 2812; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 2813; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] 2814; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 2815; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 2816; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] 2817; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] 2818; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 2819; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] 2820; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2821; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 2822; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 2823; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] 2824; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 2825; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] 2826; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 2827; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7 2828; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] 2829; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 2830; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 2831; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm12 2832; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2833; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 2834; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7] 2835; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 2836; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] 2837; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 2838; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 2839; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 2840; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 2841; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2842; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11) 2843; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 2844; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 2845; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2846; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] 2847; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 2848; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 2849; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rsi) 2850; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx) 2851; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rcx) 2852; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r8) 2853; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r9) 2854; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2855; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 2856; AVX512DQ-FCP-NEXT: vzeroupper 2857; AVX512DQ-FCP-NEXT: retq 2858; 2859; AVX512BW-LABEL: load_i16_stride6_vf16: 2860; AVX512BW: # %bb.0: 2861; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2862; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 2863; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] 2864; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 2865; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 2866; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 2867; AVX512BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 2868; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 2869; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 2870; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 2871; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] 2872; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 2873; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] 2874; AVX512BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 2875; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 2876; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 2877; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 2878; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2879; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 2880; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 2881; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 2882; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 2883; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 2884; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 2885; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 2886; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 2887; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 2888; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 2889; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] 2890; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 2891; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 2892; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 2893; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] 2894; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 2895; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 2896; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] 2897; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 2898; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 2899; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 2900; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 2901; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 2902; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] 2903; AVX512BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 2904; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 2905; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 2906; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] 2907; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) 2908; AVX512BW-NEXT: vmovdqa %ymm1, (%rdx) 2909; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) 2910; AVX512BW-NEXT: vmovdqa %ymm6, (%r8) 2911; AVX512BW-NEXT: vmovdqa %ymm7, (%r9) 2912; AVX512BW-NEXT: vmovdqa %ymm3, (%rax) 2913; AVX512BW-NEXT: vzeroupper 2914; AVX512BW-NEXT: retq 2915; 2916; AVX512BW-FCP-LABEL: load_i16_stride6_vf16: 2917; AVX512BW-FCP: # %bb.0: 2918; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2919; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 2920; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] 2921; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 2922; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 2923; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 2924; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 2925; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 2926; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 2927; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 2928; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] 2929; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 2930; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] 2931; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 2932; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 2933; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 2934; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 2935; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2936; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 2937; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] 2938; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 2939; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 2940; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 2941; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 2942; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 2943; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 2944; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 2945; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 2946; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] 2947; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 2948; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 2949; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 2950; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] 2951; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 2952; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 2953; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] 2954; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 2955; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 2956; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 2957; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 2958; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 2959; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] 2960; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 2961; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 2962; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 2963; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] 2964; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) 2965; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) 2966; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) 2967; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) 2968; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) 2969; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) 2970; AVX512BW-FCP-NEXT: vzeroupper 2971; AVX512BW-FCP-NEXT: retq 2972; 2973; AVX512DQ-BW-LABEL: load_i16_stride6_vf16: 2974; AVX512DQ-BW: # %bb.0: 2975; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2976; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 2977; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] 2978; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 2979; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 2980; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 2981; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 2982; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 2983; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 2984; 
AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 2985; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2986; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 2987; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] 2988; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 2989; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 2990; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 2991; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 2992; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2993; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 2994; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] 2995; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 2996; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 2997; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 2998; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 2999; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 3000; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 3001; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 3002; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 3003; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] 3004; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 3005; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 3006; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 3007; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] 3008; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 3009; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 3010; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] 3011; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 3012; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 3013; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 3014; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 3015; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 3016; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] 3017; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 3018; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 3019; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 3020; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] 3021; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) 3022; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rdx) 3023; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) 3024; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r8) 3025; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r9) 3026; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rax) 3027; AVX512DQ-BW-NEXT: vzeroupper 3028; AVX512DQ-BW-NEXT: retq 3029; 3030; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16: 3031; AVX512DQ-BW-FCP: # %bb.0: 3032; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3033; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] 3034; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] 3035; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 3036; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 3037; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 3038; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 3039; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 3040; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 3041; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 3042; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3043; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] 3044; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] 3045; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 3046; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 3047; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 3048; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 3049; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 3050; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] 3051; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] 3052; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 3053; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 3054; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 3055; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 3056; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 3057; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 3058; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 3059; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] 3060; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] 3061; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 3062; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 3063; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 3064; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] 3065; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] 3066; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] 3067; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] 3068; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 3069; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 3070; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 3071; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] 3072; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] 3073; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] 3074; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 3075; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 3076; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 3077; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] 3078; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) 3079; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) 3080; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) 3081; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) 3082; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) 3083; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) 3084; AVX512DQ-BW-FCP-NEXT: vzeroupper 3085; AVX512DQ-BW-FCP-NEXT: retq 3086 %wide.vec = load <96 x i16>, ptr %in.vec, align 64 3087 %strided.vec0 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 
30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90> 3088 %strided.vec1 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91> 3089 %strided.vec2 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92> 3090 %strided.vec3 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93> 3091 %strided.vec4 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94> 3092 %strided.vec5 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95> 3093 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64 3094 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64 3095 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64 3096 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64 3097 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64 3098 store <16 x i16> %strided.vec5, ptr %out.vec5, align 64 3099 ret void 3100} 3101 3102define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 3103; SSE-LABEL: load_i16_stride6_vf32: 3104; SSE: # %bb.0: 3105; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 3106; SSE-NEXT: movdqa 304(%rdi), %xmm9 3107; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3108; SSE-NEXT: movdqa 320(%rdi), %xmm5 3109; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3110; SSE-NEXT: movdqa 64(%rdi), %xmm3 3111; SSE-NEXT: movdqa 80(%rdi), %xmm0 3112; SSE-NEXT: movdqa (%rdi), %xmm4 3113; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3114; SSE-NEXT: movdqa 16(%rdi), %xmm6 3115; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3116; SSE-NEXT: movdqa 32(%rdi), %xmm1 3117; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3118; SSE-NEXT: movdqa 48(%rdi), %xmm7 3119; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3120; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] 3121; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] 3122; SSE-NEXT: movdqa %xmm10, %xmm2 3123; SSE-NEXT: pandn %xmm1, %xmm2 3124; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] 3125; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3126; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 3127; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] 3128; SSE-NEXT: pand %xmm10, %xmm1 3129; SSE-NEXT: por %xmm2, %xmm1 3130; SSE-NEXT: movdqa %xmm1, %xmm2 3131; SSE-NEXT: movdqa %xmm3, %xmm1 3132; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] 3133; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 3134; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3135; SSE-NEXT: movdqa %xmm0, %xmm4 3136; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] 3137; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
3138; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3139; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] 3140; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] 3141; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3142; SSE-NEXT: pslld $16, %xmm0 3143; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3144; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3145; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,3,2,3] 3146; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7] 3147; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] 3148; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 3149; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3150; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,2,4,5,6,7] 3151; SSE-NEXT: movdqa %xmm10, %xmm4 3152; SSE-NEXT: movdqa %xmm10, %xmm1 3153; SSE-NEXT: pandn %xmm0, %xmm1 3154; SSE-NEXT: movdqa 288(%rdi), %xmm0 3155; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3156; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 3157; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3158; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 3159; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] 3160; SSE-NEXT: pand %xmm10, %xmm0 3161; SSE-NEXT: por %xmm1, %xmm0 3162; SSE-NEXT: movdqa %xmm0, %xmm5 3163; SSE-NEXT: movdqa 352(%rdi), %xmm2 3164; SSE-NEXT: movdqa 368(%rdi), %xmm1 3165; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] 3166; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 3167; SSE-NEXT: movdqa %xmm1, %xmm0 3168; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] 3169; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3170; SSE-NEXT: movdqa %xmm2, %xmm0 3171; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3172; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] 3173; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] 3174; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3175; SSE-NEXT: pslld $16, %xmm1 3176; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3177; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3178; SSE-NEXT: movdqa 336(%rdi), %xmm1 3179; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3180; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 3181; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7] 3182; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[1,3] 3183; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] 3184; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3185; SSE-NEXT: movdqa 224(%rdi), %xmm0 3186; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3187; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 3188; SSE-NEXT: movdqa %xmm10, %xmm2 3189; SSE-NEXT: pandn %xmm0, %xmm2 3190; SSE-NEXT: movdqa 208(%rdi), %xmm5 3191; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3192; SSE-NEXT: movdqa 192(%rdi), %xmm0 3193; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3194; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 3195; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3196; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 3197; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] 3198; SSE-NEXT: pand %xmm10, %xmm0 3199; 
SSE-NEXT: por %xmm2, %xmm0 3200; SSE-NEXT: movdqa %xmm0, %xmm2 3201; SSE-NEXT: movdqa 256(%rdi), %xmm5 3202; SSE-NEXT: movdqa 272(%rdi), %xmm7 3203; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,2,3,3] 3204; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] 3205; SSE-NEXT: movdqa %xmm7, %xmm0 3206; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] 3207; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 3208; SSE-NEXT: movdqa %xmm5, %xmm0 3209; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3210; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] 3211; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm7[2,3] 3212; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3213; SSE-NEXT: pslld $16, %xmm7 3214; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3215; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] 3216; SSE-NEXT: movdqa 240(%rdi), %xmm5 3217; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3218; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3] 3219; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,0,2,4,5,6,7] 3220; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[1,3] 3221; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 3222; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3223; SSE-NEXT: movdqa 128(%rdi), %xmm0 3224; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3225; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 3226; SSE-NEXT: movdqa %xmm10, %xmm7 3227; SSE-NEXT: pandn %xmm0, %xmm7 3228; SSE-NEXT: movdqa 112(%rdi), %xmm11 3229; SSE-NEXT: movdqa 96(%rdi), %xmm0 3230; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3231; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] 3232; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] 3233; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] 3234; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3235; SSE-NEXT: pand %xmm10, %xmm0 3236; SSE-NEXT: por %xmm7, %xmm0 3237; SSE-NEXT: movdqa 160(%rdi), %xmm5 3238; SSE-NEXT: movdqa 176(%rdi), %xmm9 3239; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] 3240; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] 3241; SSE-NEXT: movdqa %xmm9, %xmm2 3242; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] 3243; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3244; SSE-NEXT: movdqa %xmm5, %xmm10 3245; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3246; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm9[0,0] 3247; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[2,3] 3248; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3249; SSE-NEXT: pslld $16, %xmm9 3250; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3251; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 3252; SSE-NEXT: movdqa 144(%rdi), %xmm2 3253; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3254; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3] 3255; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] 3256; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm13[1,3] 3257; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0] 3258; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3259; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 
3260; SSE-NEXT: movdqa %xmm5, %xmm10 3261; SSE-NEXT: psrld $16, %xmm10 3262; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3263; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 3264; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] 3265; SSE-NEXT: movdqa %xmm4, %xmm10 3266; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3267; SSE-NEXT: pandn %xmm2, %xmm10 3268; SSE-NEXT: pand %xmm4, %xmm0 3269; SSE-NEXT: movdqa %xmm4, %xmm13 3270; SSE-NEXT: por %xmm10, %xmm0 3271; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] 3272; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3273; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[1,3] 3274; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] 3275; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3276; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3277; SSE-NEXT: movdqa %xmm7, %xmm8 3278; SSE-NEXT: psrld $16, %xmm8 3279; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3280; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 3281; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] 3282; SSE-NEXT: movdqa %xmm13, %xmm8 3283; SSE-NEXT: movdqa %xmm13, %xmm4 3284; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3285; SSE-NEXT: pandn %xmm10, %xmm4 3286; SSE-NEXT: pand %xmm13, %xmm0 3287; SSE-NEXT: por %xmm4, %xmm0 3288; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 3289; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] 3290; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] 3291; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3292; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3293; SSE-NEXT: psrld $16, %xmm1 3294; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3295; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 3296; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3297; SSE-NEXT: movdqa %xmm13, %xmm1 3298; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3299; SSE-NEXT: pandn %xmm13, %xmm1 3300; SSE-NEXT: pand %xmm8, %xmm0 3301; SSE-NEXT: por %xmm1, %xmm0 3302; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7] 3303; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] 3304; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] 3305; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3306; SSE-NEXT: psrld $16, %xmm11 3307; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] 3308; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] 3309; SSE-NEXT: pand %xmm8, %xmm3 3310; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3311; SSE-NEXT: pandn %xmm11, %xmm8 3312; SSE-NEXT: por %xmm3, %xmm8 3313; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] 3314; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3] 3315; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,0] 3316; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3317; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3318; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3319; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] 3320; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 3321; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] 3322; SSE-NEXT: movdqa %xmm1, %xmm4 3323; SSE-NEXT: pandn %xmm2, %xmm4 3324; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3325; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[0,0] 3326; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,3] 3327; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] 3328; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 3329; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,2,3,4,5,6,7] 3330; SSE-NEXT: pand %xmm1, %xmm6 3331; SSE-NEXT: por %xmm4, %xmm6 3332; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3333; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3334; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 3335; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3336; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 3337; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 3338; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4] 3339; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] 3340; SSE-NEXT: movdqa %xmm15, %xmm0 3341; SSE-NEXT: pandn %xmm4, %xmm0 3342; SSE-NEXT: pand %xmm15, %xmm6 3343; SSE-NEXT: por %xmm6, %xmm0 3344; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3345; SSE-NEXT: movdqa %xmm10, %xmm4 3346; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3347; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3348; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] 3349; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] 3350; SSE-NEXT: movdqa %xmm1, %xmm6 3351; SSE-NEXT: pandn %xmm4, %xmm6 3352; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3353; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[0,0] 3354; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] 3355; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] 3356; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] 3357; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] 3358; SSE-NEXT: pand %xmm1, %xmm8 3359; SSE-NEXT: por %xmm6, %xmm8 3360; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3361; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3362; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 3363; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3364; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] 3365; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 3366; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] 3367; SSE-NEXT: movdqa %xmm15, %xmm0 3368; SSE-NEXT: pandn %xmm5, %xmm0 3369; SSE-NEXT: pand %xmm15, %xmm8 3370; SSE-NEXT: por %xmm8, %xmm0 3371; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3372; SSE-NEXT: movdqa %xmm13, %xmm5 3373; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3374; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3375; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] 3376; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 3377; SSE-NEXT: movdqa %xmm1, %xmm6 3378; SSE-NEXT: pandn %xmm5, %xmm6 3379; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3380; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3381; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] 3382; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] 3383; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] 3384; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] 3385; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] 3386; SSE-NEXT: pand %xmm1, %xmm8 3387; SSE-NEXT: por %xmm6, %xmm8 3388; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3389; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload 3390; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 3391; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3392; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 3393; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 3394; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 3395; SSE-NEXT: movdqa %xmm15, %xmm0 3396; SSE-NEXT: pandn %xmm2, %xmm0 3397; SSE-NEXT: pand %xmm15, %xmm8 3398; SSE-NEXT: por %xmm8, %xmm0 3399; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill 3400; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3401; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3402; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] 3403; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm6[0] 3404; SSE-NEXT: movdqa %xmm1, %xmm6 3405; SSE-NEXT: pandn %xmm11, %xmm6 3406; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3407; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3408; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] 3409; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] 3410; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] 3411; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] 3412; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] 3413; SSE-NEXT: pand %xmm1, %xmm8 3414; SSE-NEXT: por %xmm6, %xmm8 3415; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3416; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3417; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,2] 3418; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,6,6,7] 3419; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] 3420; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 3421; SSE-NEXT: movdqa %xmm15, %xmm0 3422; SSE-NEXT: pandn %xmm6, %xmm0 3423; SSE-NEXT: pand %xmm15, %xmm8 3424; SSE-NEXT: por %xmm8, %xmm0 3425; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3426; SSE-NEXT: movdqa %xmm14, %xmm0 3427; SSE-NEXT: movdqa %xmm14, %xmm6 3428; SSE-NEXT: psrlq $48, %xmm6 3429; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3430; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] 3431; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] 3432; SSE-NEXT: movdqa %xmm1, %xmm6 3433; SSE-NEXT: pandn %xmm7, %xmm6 3434; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 3435; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 3436; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] 3437; SSE-NEXT: pand %xmm1, %xmm3 3438; SSE-NEXT: por %xmm6, %xmm3 3439; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 3440; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] 3441; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] 3442; SSE-NEXT: movdqa %xmm15, %xmm7 3443; SSE-NEXT: pandn %xmm6, %xmm7 3444; SSE-NEXT: pand %xmm15, %xmm3 3445; SSE-NEXT: por %xmm3, %xmm7 3446; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3447; SSE-NEXT: movdqa %xmm12, %xmm3 3448; SSE-NEXT: psrlq $48, %xmm3 3449; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3450; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] 3451; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] 3452; SSE-NEXT: movdqa %xmm1, %xmm3 3453; SSE-NEXT: pandn %xmm6, %xmm3 3454; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] 3455; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] 3456; SSE-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm4[0,1,3,3,4,5,6,7] 3457; SSE-NEXT: pand %xmm1, %xmm4 3458; SSE-NEXT: por %xmm3, %xmm4 3459; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3460; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] 3461; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] 3462; SSE-NEXT: movdqa %xmm15, %xmm6 3463; SSE-NEXT: pandn %xmm3, %xmm6 3464; SSE-NEXT: pand %xmm15, %xmm4 3465; SSE-NEXT: por %xmm4, %xmm6 3466; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3467; SSE-NEXT: movdqa %xmm10, %xmm3 3468; SSE-NEXT: movdqa %xmm10, %xmm14 3469; SSE-NEXT: psrlq $48, %xmm3 3470; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] 3471; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] 3472; SSE-NEXT: movdqa %xmm1, %xmm3 3473; SSE-NEXT: pandn %xmm4, %xmm3 3474; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] 3475; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] 3476; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] 3477; SSE-NEXT: pand %xmm1, %xmm4 3478; SSE-NEXT: por %xmm3, %xmm4 3479; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3480; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] 3481; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] 3482; SSE-NEXT: movdqa %xmm15, %xmm5 3483; SSE-NEXT: pandn %xmm3, %xmm5 3484; SSE-NEXT: pand %xmm15, %xmm4 3485; SSE-NEXT: por %xmm4, %xmm5 3486; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3487; SSE-NEXT: movdqa %xmm9, %xmm3 3488; SSE-NEXT: psrlq $48, %xmm3 3489; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3490; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] 3491; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] 3492; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 3493; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 3494; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 3495; SSE-NEXT: pand %xmm1, %xmm2 3496; SSE-NEXT: pandn %xmm4, %xmm1 3497; SSE-NEXT: por %xmm2, %xmm1 3498; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] 3499; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 3500; SSE-NEXT: movdqa %xmm15, %xmm3 3501; SSE-NEXT: pandn %xmm2, %xmm3 3502; SSE-NEXT: pand %xmm15, %xmm1 3503; SSE-NEXT: por %xmm1, %xmm3 3504; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3505; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3506; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] 3507; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3508; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] 3509; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 3510; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] 3511; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,4,6] 3512; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 3513; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] 3514; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3515; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] 3516; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 3517; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 3518; SSE-NEXT: movdqa %xmm15, %xmm11 3519; SSE-NEXT: pandn %xmm3, %xmm11 3520; SSE-NEXT: andps %xmm15, %xmm1 3521; SSE-NEXT: por %xmm1, %xmm11 3522; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3523; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] 3524; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3525; SSE-NEXT: # xmm3 = mem[2,3,2,3] 3526; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 3527; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] 3528; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3529; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] 3530; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3531; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1] 3532; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] 3533; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3534; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] 3535; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 3536; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 3537; SSE-NEXT: movdqa %xmm15, %xmm5 3538; SSE-NEXT: pandn %xmm3, %xmm5 3539; SSE-NEXT: andps %xmm15, %xmm4 3540; SSE-NEXT: por %xmm4, %xmm5 3541; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3542; SSE-NEXT: # xmm3 = mem[1,1,1,1] 3543; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 3544; SSE-NEXT: # xmm4 = mem[2,3,2,3] 3545; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 3546; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3547; SSE-NEXT: # xmm0 = mem[0,1,0,3] 3548; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3549; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] 3550; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm14[1] 3551; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 3552; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 3553; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 3554; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 3555; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm4[0,1,2,3,4,5,4,6] 3556; SSE-NEXT: movdqa %xmm15, %xmm4 3557; SSE-NEXT: pandn %xmm14, %xmm4 3558; SSE-NEXT: andps %xmm15, %xmm3 3559; SSE-NEXT: por %xmm3, %xmm4 3560; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 3561; SSE-NEXT: # xmm14 = mem[1,1,1,1] 3562; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3563; SSE-NEXT: # xmm3 = mem[2,3,2,3] 3564; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] 3565; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,1,0,3] 3566; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,4,6] 3567; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] 3568; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] 3569; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3570; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] 3571; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 3572; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] 3573; SSE-NEXT: movdqa %xmm15, %xmm3 3574; SSE-NEXT: pandn %xmm1, %xmm3 3575; SSE-NEXT: andps %xmm15, %xmm0 3576; SSE-NEXT: por %xmm0, %xmm3 3577; SSE-NEXT: movdqa %xmm2, %xmm1 3578; SSE-NEXT: psrlq $48, %xmm1 3579; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3580; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] 3581; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3582; SSE-NEXT: psrld $16, %xmm1 3583; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] 3584; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 3585; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] 3586; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm10[3,1,2,3,4,5,6,7] 3587; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 3588; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] 3589; SSE-NEXT: movdqa %xmm15, %xmm2 3590; SSE-NEXT: pandn %xmm1, %xmm2 3591; SSE-NEXT: andps %xmm15, %xmm0 3592; SSE-NEXT: por %xmm0, %xmm2 3593; SSE-NEXT: psrlq $48, %xmm13 3594; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3595; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3596; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] 3597; SSE-NEXT: movdqa %xmm0, %xmm1 3598; SSE-NEXT: psrld $16, %xmm12 3599; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3600; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] 3601; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] 3602; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 3603; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3604; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] 3605; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 3606; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] 3607; SSE-NEXT: movdqa %xmm15, %xmm1 3608; SSE-NEXT: pandn %xmm8, %xmm1 3609; SSE-NEXT: andps %xmm15, %xmm0 3610; SSE-NEXT: por %xmm0, %xmm1 3611; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3612; SSE-NEXT: psrlq $48, %xmm6 3613; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3614; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3615; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 3616; SSE-NEXT: movdqa %xmm0, %xmm6 3617; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3618; SSE-NEXT: psrld $16, %xmm7 3619; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3620; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] 3621; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] 3622; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] 3623; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 3624; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] 3625; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] 3626; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] 3627; SSE-NEXT: movdqa %xmm15, %xmm9 3628; SSE-NEXT: pandn %xmm8, %xmm9 3629; SSE-NEXT: andps %xmm15, %xmm0 3630; SSE-NEXT: por %xmm0, %xmm9 3631; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3632; SSE-NEXT: psrlq $48, %xmm6 3633; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3634; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3635; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 3636; SSE-NEXT: movdqa %xmm0, %xmm6 3637; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3638; SSE-NEXT: psrld $16, %xmm7 3639; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] 3640; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] 3641; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] 3642; SSE-NEXT: andps %xmm15, %xmm0 3643; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 3644; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] 3645; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] 3646; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] 3647; SSE-NEXT: pandn %xmm8, %xmm15 3648; SSE-NEXT: por %xmm0, 
%xmm15 3649; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3650; SSE-NEXT: movaps %xmm0, 16(%rsi) 3651; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3652; SSE-NEXT: movaps %xmm0, 32(%rsi) 3653; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3654; SSE-NEXT: movaps %xmm0, 48(%rsi) 3655; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3656; SSE-NEXT: movaps %xmm0, (%rsi) 3657; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3658; SSE-NEXT: movaps %xmm0, 16(%rdx) 3659; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3660; SSE-NEXT: movaps %xmm0, 32(%rdx) 3661; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3662; SSE-NEXT: movaps %xmm0, 48(%rdx) 3663; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3664; SSE-NEXT: movaps %xmm0, (%rdx) 3665; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3666; SSE-NEXT: movaps %xmm0, 16(%rcx) 3667; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 3668; SSE-NEXT: movaps %xmm0, 32(%rcx) 3669; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3670; SSE-NEXT: movaps %xmm0, 48(%rcx) 3671; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3672; SSE-NEXT: movaps %xmm0, (%rcx) 3673; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3674; SSE-NEXT: movaps %xmm0, 16(%r8) 3675; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3676; SSE-NEXT: movaps %xmm0, 32(%r8) 3677; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3678; SSE-NEXT: movaps %xmm0, 48(%r8) 3679; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3680; SSE-NEXT: movaps %xmm0, (%r8) 3681; SSE-NEXT: movdqa %xmm3, 16(%r9) 3682; SSE-NEXT: movdqa %xmm4, 32(%r9) 3683; SSE-NEXT: movdqa %xmm5, 48(%r9) 3684; SSE-NEXT: movdqa %xmm11, (%r9) 3685; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 3686; SSE-NEXT: movdqa %xmm15, 16(%rax) 3687; SSE-NEXT: movdqa %xmm9, 32(%rax) 3688; SSE-NEXT: movdqa %xmm1, 48(%rax) 3689; SSE-NEXT: movdqa %xmm2, (%rax) 3690; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 3691; SSE-NEXT: retq 3692; 3693; AVX-LABEL: load_i16_stride6_vf32: 3694; AVX: # %bb.0: 3695; AVX-NEXT: subq $552, %rsp # imm = 0x228 3696; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 3697; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3698; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3] 3699; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] 3700; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 3701; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3702; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3703; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 3704; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3705; AVX-NEXT: vpslld $16, %xmm1, %xmm1 3706; AVX-NEXT: vmovdqa 64(%rdi), %xmm12 3707; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3708; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3709; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3710; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 3711; AVX-NEXT: vmovdqa (%rdi), %xmm3 3712; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3713; AVX-NEXT: vmovdqa 16(%rdi), %xmm10 3714; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 3715; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3716; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 3717; AVX-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3718; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1 3719; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] 3720; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7] 3721; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3722; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] 3723; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] 3724; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] 3725; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] 3726; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 3727; AVX-NEXT: vmovdqa 176(%rdi), %xmm1 3728; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3729; AVX-NEXT: vpslld $16, %xmm1, %xmm1 3730; AVX-NEXT: vmovdqa 160(%rdi), %xmm2 3731; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3732; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3733; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3734; AVX-NEXT: vmovdqa 128(%rdi), %xmm14 3735; AVX-NEXT: vpsrlq $16, %xmm14, %xmm2 3736; AVX-NEXT: vmovdqa 144(%rdi), %xmm3 3737; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3738; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3] 3739; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] 3740; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3741; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] 3742; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 3743; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 3744; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3745; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1 3746; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 3747; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3748; AVX-NEXT: vmovdqa 272(%rdi), %xmm0 3749; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 3750; AVX-NEXT: vpslld $16, %xmm0, %xmm0 3751; AVX-NEXT: vmovdqa 256(%rdi), %xmm1 3752; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3753; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3754; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3755; AVX-NEXT: vmovdqa 288(%rdi), %xmm1 3756; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3757; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] 3758; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] 3759; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 3760; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3761; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 3762; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3763; AVX-NEXT: vmovdqa 224(%rdi), %xmm1 3764; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3765; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1 3766; AVX-NEXT: vmovdqa 240(%rdi), %xmm2 3767; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3768; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3] 3769; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] 3770; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 3771; AVX-NEXT: vmovdqa 192(%rdi), %xmm2 3772; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3773; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] 3774; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7] 3775; AVX-NEXT: vmovdqa 208(%rdi), %xmm2 
3776; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3777; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] 3778; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7] 3779; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3,4,5,6,7] 3780; AVX-NEXT: vmovdqa 368(%rdi), %xmm0 3781; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3782; AVX-NEXT: vpslld $16, %xmm0, %xmm1 3783; AVX-NEXT: vmovdqa 352(%rdi), %xmm0 3784; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3785; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3786; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] 3787; AVX-NEXT: vmovdqa 320(%rdi), %xmm0 3788; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3789; AVX-NEXT: vpsrlq $16, %xmm0, %xmm0 3790; AVX-NEXT: vmovdqa 336(%rdi), %xmm1 3791; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3792; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 3793; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7] 3794; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 3795; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] 3796; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 3797; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3798; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 3799; AVX-NEXT: vmovaps %ymm5, %ymm9 3800; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 3801; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3802; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7] 3803; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3804; AVX-NEXT: vpsrld $16, %xmm11, %xmm2 3805; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3806; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] 3807; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3808; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] 3809; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 3810; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] 3811; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3812; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] 3813; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] 3814; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,5,7,6,7] 3815; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3816; AVX-NEXT: vpsrld $16, %xmm10, %xmm8 3817; AVX-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] 3818; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5],xmm7[6,7] 3819; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3820; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] 3821; AVX-NEXT: vmovdqa %xmm14, %xmm7 3822; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] 3823; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 3824; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3825; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] 3826; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3827; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] 3828; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm6[6,7] 3829; AVX-NEXT: vandps %ymm0, %ymm9, %ymm0 3830; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 3831; AVX-NEXT: vandnps %ymm2, %ymm9, %ymm2 3832; AVX-NEXT: 
vmovaps %ymm9, %ymm6 3833; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 3834; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3835; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] 3836; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3837; AVX-NEXT: vpsrld $16, %xmm15, %xmm2 3838; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3839; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3840; AVX-NEXT: # xmm2 = mem[2,2,3,3] 3841; AVX-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload 3842; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 3843; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 3844; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] 3845; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3846; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] 3847; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 3848; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] 3849; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3850; AVX-NEXT: vpsrld $16, %xmm13, %xmm4 3851; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 3852; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] 3853; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 3854; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 3855; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3856; AVX-NEXT: # xmm2 = mem[1,1,1,1] 3857; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3858; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3859; AVX-NEXT: # xmm2 = mem[2,2,3,3] 3860; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3861; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 3862; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] 3863; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 3864; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3865; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1 3866; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 3867; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3868; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3869; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 3870; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3871; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 3872; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] 3873; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload 3874; AVX-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] 3875; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3876; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 3877; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 3878; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload 3879; AVX-NEXT: # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] 3880; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3881; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1 3882; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] 3883; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload 3884; AVX-NEXT: # xmm10 = mem[0,1,2,3],xmm12[4,5],mem[6,7] 3885; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm6 3886; AVX-NEXT: vinsertf128 
$1, %xmm1, %ymm6, %ymm6 3887; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 3888; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm5 3889; AVX-NEXT: vandps %ymm1, %ymm6, %ymm6 3890; AVX-NEXT: vorps %ymm5, %ymm6, %ymm6 3891; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 3892; AVX-NEXT: # xmm5 = mem[1,1,1,1] 3893; AVX-NEXT: vmovdqa %xmm7, %xmm12 3894; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3895; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3896; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] 3897; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm8[4,5],xmm14[6,7] 3898; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm8 3899; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] 3900; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 3901; AVX-NEXT: vandps %ymm6, %ymm8, %ymm6 3902; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 3903; AVX-NEXT: vandnps %ymm7, %ymm8, %ymm7 3904; AVX-NEXT: vorps %ymm7, %ymm6, %ymm6 3905; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3906; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3907; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] 3908; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3909; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] 3910; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload 3911; AVX-NEXT: # xmm6 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7] 3912; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3913; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm8 3914; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] 3915; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload 3916; AVX-NEXT: # xmm7 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] 3917; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3918; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload 3919; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload 3920; AVX-NEXT: # xmm8 = mem[0,1,2,3],xmm6[4,5],mem[6,7] 3921; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm15 3922; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 3923; AVX-NEXT: vandnps %ymm11, %ymm1, %ymm11 3924; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 3925; AVX-NEXT: vorps %ymm0, %ymm11, %ymm11 3926; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3927; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] 3928; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3929; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3930; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] 3931; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload 3932; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm3[4,5],mem[6,7] 3933; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 3934; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3,4],xmm2[5,6,7] 3935; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 3936; AVX-NEXT: vandps %ymm15, %ymm11, %ymm11 3937; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 3938; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 3939; AVX-NEXT: vorps %ymm2, %ymm11, %ymm2 3940; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3941; AVX-NEXT: vpsrlq $48, %xmm4, %xmm2 3942; AVX-NEXT: vpshufd $250, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3943; AVX-NEXT: # xmm11 = mem[2,2,3,3] 3944; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm11[0],xmm2[0] 3945; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] 3946; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3947; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm9 3948; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4],xmm9[5,6,7] 3949; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3950; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm3 3951; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] 3952; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm4 3953; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 3954; AVX-NEXT: vandnps %ymm9, %ymm1, %ymm4 3955; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3 3956; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 3957; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3958; AVX-NEXT: vpsrlq $48, %xmm10, %xmm4 3959; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] 3960; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm9[0],xmm4[0] 3961; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 3962; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] 3963; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 3964; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 3965; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 3966; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 3967; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3968; AVX-NEXT: vpsrlq $48, %xmm14, %xmm3 3969; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3970; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] 3971; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 3972; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3973; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4 3974; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] 3975; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm4 3976; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5 3977; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 3978; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 3979; AVX-NEXT: vandps %ymm1, %ymm4, %ymm1 3980; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 3981; AVX-NEXT: vpsrlq $48, %xmm13, %xmm3 3982; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] 3983; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 3984; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3985; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] 3986; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1 3987; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 3988; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 3989; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 3990; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3991; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3992; AVX-NEXT: # xmm1 = mem[1,1,1,1] 3993; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3994; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] 3995; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3996; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3997; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3998; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] 3999; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4000; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] 4001; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm2 4002; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 4003; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 4004; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] 4005; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4006; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 4007; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 4008; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4009; AVX-NEXT: # xmm0 = mem[0,1,0,3] 4010; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4011; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,6] 4012; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 4013; AVX-NEXT: # xmm5 = xmm5[1],mem[1] 4014; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3,4,5,6,7] 4015; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 4016; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 4017; AVX-NEXT: vandps %ymm2, %ymm5, %ymm5 4018; AVX-NEXT: vorps %ymm1, %ymm5, %ymm1 4019; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4020; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 4021; AVX-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] 4022; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 4023; AVX-NEXT: # xmm7 = mem[0,1,0,3] 4024; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,6] 4025; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm10[1] 4026; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm9 4027; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] 4028; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 4029; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] 4030; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4031; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 4032; AVX-NEXT: # xmm8 = mem[1,1,1,1] 4033; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 4034; AVX-NEXT: # xmm9 = mem[2,3,2,3] 4035; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 4036; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4037; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4038; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] 4039; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4040; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm10 4041; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 4042; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 4043; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] 4044; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4045; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] 4046; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 4047; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] 4048; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,4,6] 4049; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4050; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] 4051; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3,4,5,6,7] 4052; AVX-NEXT: vandnps %ymm9, %ymm2, %ymm9 4053; AVX-NEXT: vandps %ymm2, %ymm11, %ymm11 4054; AVX-NEXT: vorps %ymm9, %ymm11, %ymm13 4055; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4056; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload 
4057; AVX-NEXT: # xmm11 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] 4058; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3 4059; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 4060; AVX-NEXT: # xmm9 = mem[0,1,0,3] 4061; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,6] 4062; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4063; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] 4064; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7] 4065; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 4066; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7] 4067; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4068; AVX-NEXT: vpsrlq $48, %xmm12, %xmm12 4069; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4070; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] 4071; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] 4072; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4073; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm4 4074; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 4075; AVX-NEXT: vpsrlq $48, %xmm14, %xmm13 4076; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4077; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] 4078; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4079; AVX-NEXT: vpsrld $16, %xmm6, %xmm14 4080; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 4081; AVX-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] 4082; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm14[1] 4083; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] 4084; AVX-NEXT: vandnps %ymm4, %ymm2, %ymm4 4085; AVX-NEXT: vandps %ymm2, %ymm6, %ymm6 4086; AVX-NEXT: vorps %ymm4, %ymm6, %ymm4 4087; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4088; AVX-NEXT: vpsrld $16, %xmm6, %xmm6 4089; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7] 4090; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] 4091; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm5 4092; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] 4093; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 4094; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 4095; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4096; AVX-NEXT: vpsrlq $48, %xmm5, %xmm5 4097; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4098; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4099; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 4100; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4101; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6 4102; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 4103; AVX-NEXT: vpsrlq $48, %xmm15, %xmm6 4104; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4105; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 4106; AVX-NEXT: vpsrld $16, %xmm1, %xmm7 4107; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7] 4108; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1] 4109; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] 4110; 
AVX-NEXT: vandnps %ymm5, %ymm2, %ymm5 4111; AVX-NEXT: vandps %ymm2, %ymm6, %ymm2 4112; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 4113; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm5 4114; AVX-NEXT: vpsrld $16, %xmm0, %xmm6 4115; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,7] 4116; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] 4117; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] 4118; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 4119; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] 4120; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4121; AVX-NEXT: vmovaps %ymm0, 32(%rsi) 4122; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 4123; AVX-NEXT: vmovaps %ymm5, (%rsi) 4124; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4125; AVX-NEXT: vmovaps %ymm0, 32(%rdx) 4126; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4127; AVX-NEXT: vmovaps %ymm0, (%rdx) 4128; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4129; AVX-NEXT: vmovaps %ymm0, 32(%rcx) 4130; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4131; AVX-NEXT: vmovaps %ymm0, (%rcx) 4132; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4133; AVX-NEXT: vmovaps %ymm0, 32(%r8) 4134; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4135; AVX-NEXT: vmovaps %ymm0, (%r8) 4136; AVX-NEXT: vmovaps %ymm3, 32(%r9) 4137; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4138; AVX-NEXT: vmovaps %ymm0, (%r9) 4139; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 4140; AVX-NEXT: vmovaps %ymm2, 32(%rax) 4141; AVX-NEXT: vmovaps %ymm4, (%rax) 4142; AVX-NEXT: addq $552, %rsp # imm = 0x228 4143; AVX-NEXT: vzeroupper 4144; AVX-NEXT: retq 4145; 4146; AVX2-LABEL: load_i16_stride6_vf32: 4147; AVX2: # %bb.0: 4148; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 4149; AVX2-NEXT: vmovdqa (%rdi), %ymm5 4150; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4151; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 4152; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill 4153; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 4154; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 4155; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 4156; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11 4157; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 4158; AVX2-NEXT: vmovdqa 256(%rdi), %ymm3 4159; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] 4160; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] 4161; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 4162; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4163; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] 4164; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4165; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 4166; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] 4167; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm4 4168; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] 4169; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 4170; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm0 4171; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 4172; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] 4173; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] 4174; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 4175; AVX2-NEXT: vpblendvb %ymm0, 
%ymm9, %ymm4, %ymm4 4176; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4177; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4178; AVX2-NEXT: vmovdqa %ymm11, %ymm5 4179; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4180; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] 4181; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 4182; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9 4183; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7] 4184; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] 4185; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4186; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4187; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 4188; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm6 4189; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm6 4190; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4191; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 4192; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 4193; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] 4194; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 4195; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 4196; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 4197; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 4198; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 4199; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4200; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm1 4201; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm3 4202; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7] 4203; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,3] 4204; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 4205; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 4206; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 4207; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 4208; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4209; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm9 4210; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3] 4211; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 4212; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm1 4213; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 4214; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 4215; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] 4216; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm1 4217; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 4218; AVX2-NEXT: vmovdqa 352(%rdi), %ymm11 4219; AVX2-NEXT: vmovdqa 320(%rdi), %ymm13 4220; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] 4221; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4222; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4223; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] 4224; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8 4225; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7] 4226; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] 4227; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 4228; 
AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4229; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] 4230; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 4231; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4232; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 4233; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4234; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 4235; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] 4236; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7] 4237; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 4238; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7] 4239; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 4240; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4241; AVX2-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 4242; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 4243; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 4244; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 4245; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3] 4246; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] 4247; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7] 4248; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 4249; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload 4250; AVX2-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7] 4251; AVX2-NEXT: vpshufb %ymm15, %ymm10, %ymm15 4252; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] 4253; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4254; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] 4255; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 4256; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4257; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3] 4258; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] 4259; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 4260; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 4261; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7] 4262; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 4263; AVX2-NEXT: vpshufb %ymm6, %ymm12, %ymm7 4264; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] 4265; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] 4266; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] 4267; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 4268; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm8 4269; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 4270; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7] 4271; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4272; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 4273; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 4274; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4275; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm1 4276; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5] 4277; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 4278; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm2 4279; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3] 4280; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] 4281; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 4282; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 4283; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6],xmm3[7] 4284; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] 4285; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] 4286; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 4287; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4288; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 4289; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4290; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4291; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4292; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload 4293; AVX2-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 4294; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] 4295; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4296; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] 4297; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] 4298; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] 4299; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 4300; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4] 4301; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 4302; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4303; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4304; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 4305; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] 4306; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 4307; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] 4308; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] 4309; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 4310; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] 4311; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 4312; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4313; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 4314; AVX2-NEXT: vpshufb %ymm11, %ymm8, %ymm2 4315; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 4316; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 4317; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 4318; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 4319; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4320; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 4321; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] 4322; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4323; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] 4324; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] 4325; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7] 4326; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 4327; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] 4328; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 4329; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4330; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 
32-byte Folded Reload 4331; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 4332; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4333; AVX2-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 4334; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 4335; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] 4336; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 4337; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 4338; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7] 4339; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] 4340; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] 4341; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7] 4342; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4343; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm11 4344; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] 4345; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] 4346; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] 4347; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] 4348; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] 4349; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] 4350; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] 4351; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] 4352; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7] 4353; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] 4354; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] 4355; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] 4356; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 4357; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm8 4358; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4359; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] 4360; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] 4361; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] 4362; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 4363; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] 4364; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 4365; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] 4366; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 4367; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 4368; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] 4369; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 4370; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 4371; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 4372; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 4373; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 4374; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 4375; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 4376; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4377; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] 4378; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 4379; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 4380; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] 4381; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] 4382; AVX2-NEXT: vpbroadcastq {{.*#+}} 
xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] 4383; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm6 4384; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] 4385; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4386; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 4387; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] 4388; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4389; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 4390; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] 4391; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 4392; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 4393; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] 4394; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7] 4395; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] 4396; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] 4397; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 4398; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 4399; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] 4400; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] 4401; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 4402; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 4403; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 4404; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] 4405; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4406; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4407; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] 4408; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm2 4409; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] 4410; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] 4411; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] 4412; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 4413; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 4414; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] 4415; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4416; AVX2-NEXT: vmovaps %ymm6, 32(%rsi) 4417; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4418; AVX2-NEXT: vmovaps %ymm6, (%rsi) 4419; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4420; AVX2-NEXT: vmovaps %ymm6, 32(%rdx) 4421; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4422; AVX2-NEXT: vmovaps %ymm6, (%rdx) 4423; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4424; AVX2-NEXT: vmovaps %ymm6, 32(%rcx) 4425; AVX2-NEXT: vmovdqa %ymm9, (%rcx) 4426; AVX2-NEXT: vmovdqa %ymm5, 32(%r8) 4427; AVX2-NEXT: vmovdqa %ymm0, (%r8) 4428; AVX2-NEXT: vmovdqa %ymm4, 32(%r9) 4429; AVX2-NEXT: vmovdqa %ymm3, (%r9) 4430; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 4431; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) 4432; AVX2-NEXT: vmovdqa %ymm1, (%rax) 4433; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8 4434; AVX2-NEXT: vzeroupper 4435; AVX2-NEXT: retq 4436; 4437; AVX2-FP-LABEL: load_i16_stride6_vf32: 4438; AVX2-FP: # %bb.0: 4439; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8 4440; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 4441; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4442; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 4443; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4444; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 4445; 
AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 4446; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm9 4447; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm10 4448; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 4449; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm3 4450; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] 4451; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] 4452; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 4453; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4454; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] 4455; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 4456; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 4457; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] 4458; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm6 4459; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] 4460; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 4461; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm0 4462; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 4463; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7] 4464; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7] 4465; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 4466; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5 4467; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4468; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4469; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4470; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] 4471; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm7 4472; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8 4473; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7] 4474; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] 4475; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4476; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 4477; AVX2-FP-NEXT: vpshufb %ymm4, %ymm11, %ymm4 4478; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm4, %ymm4 4479; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4480; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 4481; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 4482; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4483; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 4484; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 4485; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 4486; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 4487; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4488; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm1 4489; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm3 4490; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 4491; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] 4492; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] 4493; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 4494; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 4495; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4496; 
AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 4497; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm1 4498; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] 4499; AVX2-FP-NEXT: vpshufb %xmm15, %xmm11, %xmm1 4500; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 4501; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 4502; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] 4503; AVX2-FP-NEXT: vpshufb %ymm13, %ymm10, %ymm1 4504; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 4505; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm1 4506; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4507; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm2 4508; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4509; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 4510; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 4511; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 4512; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9 4513; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm8 4514; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] 4515; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4516; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] 4517; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] 4518; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4519; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 4520; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4521; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8 4522; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] 4523; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm0 4524; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 4525; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 4526; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] 4527; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4528; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 4529; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 4530; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm12 4531; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] 4532; AVX2-FP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 4533; AVX2-FP-NEXT: vpshufb %xmm15, %xmm12, %xmm15 4534; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7] 4535; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4536; AVX2-FP-NEXT: vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload 4537; AVX2-FP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7] 4538; AVX2-FP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 4539; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] 4540; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4541; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] 4542; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 4543; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4544; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] 4545; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 4546; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] 4547; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] 4548; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 4549; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 4550; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] 4551; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 4552; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7] 4553; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 4554; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 4555; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 4556; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] 4557; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4558; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15] 4559; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] 4560; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4561; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 4562; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 4563; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] 4564; AVX2-FP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 4565; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 4566; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7] 4567; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7] 4568; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 4569; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] 4570; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4571; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 4572; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4573; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4574; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload 4575; AVX2-FP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] 4576; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4577; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4578; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 4579; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] 4580; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 4581; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] 4582; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 4583; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 4584; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] 4585; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] 4586; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4587; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4588; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 4589; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6 4590; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1] 4591; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] 4592; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] 4593; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm9 4594; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7] 4595; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7] 4596; 
AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4597; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 4598; AVX2-FP-NEXT: vpshufb %ymm15, %ymm14, %ymm12 4599; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15] 4600; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] 4601; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7] 4602; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7] 4603; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4604; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4605; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] 4606; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3] 4607; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 4608; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm9 4609; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 4610; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4] 4611; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] 4612; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4613; AVX2-FP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload 4614; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 4615; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4616; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4617; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 4618; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11 4619; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] 4620; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 4621; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 4622; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7] 4623; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7] 4624; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4625; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm6 4626; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] 4627; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 4628; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] 4629; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7] 4630; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 4631; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 4632; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] 4633; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 4634; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] 4635; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm4 4636; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] 4637; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] 4638; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 4639; AVX2-FP-NEXT: vpshufb %ymm10, %ymm14, %ymm4 4640; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4641; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] 4642; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] 4643; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] 4644; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm2[0,1,2,3],ymm3[4,5,6,7] 4645; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 4646; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] 4647; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 4648; AVX2-FP-NEXT: vpshufb %ymm10, %ymm13, %ymm2 4649; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm3 4650; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 4651; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] 4652; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4653; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 4654; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 4655; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 4656; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4657; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] 4658; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 4659; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 4660; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 4661; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 4662; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm7 4663; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] 4664; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 4665; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 4666; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] 4667; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4668; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 4669; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] 4670; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 4671; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] 4672; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm9 4673; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 4674; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7] 4675; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4676; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 4677; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] 4678; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 4679; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 4680; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 4681; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] 4682; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4683; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4684; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] 4685; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm2 4686; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 4687; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] 4688; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 4689; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 4690; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] 4691; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4692; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi) 4693; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4694; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi) 4695; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4696; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) 4697; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm7 # 32-byte Reload 4698; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx) 4699; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4700; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) 4701; AVX2-FP-NEXT: vmovdqa %ymm6, (%rcx) 4702; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r8) 4703; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8) 4704; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) 4705; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) 4706; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4707; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) 4708; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) 4709; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8 4710; AVX2-FP-NEXT: vzeroupper 4711; AVX2-FP-NEXT: retq 4712; 4713; AVX2-FCP-LABEL: load_i16_stride6_vf32: 4714; AVX2-FCP: # %bb.0: 4715; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8 4716; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 4717; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4718; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 4719; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4720; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 4721; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 4722; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 4723; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 4724; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 4725; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 4726; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] 4727; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] 4728; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 4729; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4730; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] 4731; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 4732; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 4733; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] 4734; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm6 4735; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] 4736; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 4737; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm0 4738; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 4739; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7] 4740; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7] 4741; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 4742; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5 4743; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4744; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4745; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4746; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] 4747; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm7 4748; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 4749; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7] 4750; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] 4751; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4752; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 4753; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4 4754; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm4, %ymm4 4755; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4756; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = 
[10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 4757; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 4758; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4759; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 4760; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 4761; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 4762; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 4763; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4764; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm1 4765; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm3 4766; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 4767; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] 4768; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] 4769; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 4770; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 4771; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4772; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 4773; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 4774; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] 4775; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm1 4776; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 4777; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 4778; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] 4779; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1 4780; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 4781; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm1 4782; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4783; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 4784; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4785; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 4786; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 4787; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 4788; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 4789; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm8 4790; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] 4791; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4792; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] 4793; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] 4794; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4795; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 4796; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4797; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 4798; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] 4799; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm0 4800; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 4801; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 4802; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] 4803; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4804; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 4805; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 4806; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, 
%xmm12 4807; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] 4808; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 4809; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm15 4810; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7] 4811; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 4812; AVX2-FCP-NEXT: vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload 4813; AVX2-FCP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7] 4814; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 4815; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] 4816; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4817; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] 4818; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 4819; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4820; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] 4821; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 4822; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] 4823; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] 4824; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 4825; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 4826; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] 4827; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 4828; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7] 4829; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 4830; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 4831; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 4832; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] 4833; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4834; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15] 4835; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] 4836; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4837; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 4838; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 4839; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] 4840; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 4841; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 4842; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7] 4843; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7] 4844; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 4845; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] 4846; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4847; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 4848; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4849; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4850; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload 4851; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] 4852; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4853; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4854; AVX2-FCP-NEXT: # ymm0 = 
mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 4855; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] 4856; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 4857; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] 4858; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 4859; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 4860; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] 4861; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] 4862; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4863; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4864; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 4865; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 4866; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1] 4867; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] 4868; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] 4869; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm9 4870; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7] 4871; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7] 4872; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4873; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 4874; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm12 4875; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15] 4876; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] 4877; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7] 4878; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7] 4879; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4880; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4881; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] 4882; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3] 4883; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 4884; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 4885; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 4886; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4] 4887; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] 4888; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4889; AVX2-FCP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload 4890; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 4891; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4892; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4893; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 4894; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 4895; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] 4896; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 4897; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 4898; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7] 4899; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7] 4900; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4901; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm6 4902; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] 4903; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 
4904; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] 4905; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7] 4906; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 4907; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 4908; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] 4909; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 4910; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] 4911; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm4 4912; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] 4913; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] 4914; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 4915; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm4 4916; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4917; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] 4918; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] 4919; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] 4920; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7] 4921; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 4922; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] 4923; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 4924; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm2 4925; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm3 4926; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 4927; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] 4928; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4929; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 4930; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 4931; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 4932; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4933; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] 4934; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 4935; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 4936; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 4937; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 4938; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm7 4939; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] 4940; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 4941; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 4942; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] 4943; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4944; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 4945; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] 4946; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 4947; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] 4948; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm9 4949; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 4950; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7] 4951; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 4952; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 4953; AVX2-FCP-NEXT: # ymm3 = 
mem[0,1,2,3,4],ymm3[5,6,7] 4954; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 4955; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 4956; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 4957; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] 4958; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4959; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 4960; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] 4961; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm2 4962; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 4963; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] 4964; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 4965; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 4966; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] 4967; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4968; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi) 4969; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4970; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi) 4971; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4972; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) 4973; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4974; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) 4975; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 4976; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) 4977; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rcx) 4978; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r8) 4979; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8) 4980; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%r9) 4981; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) 4982; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4983; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) 4984; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) 4985; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8 4986; AVX2-FCP-NEXT: vzeroupper 4987; AVX2-FCP-NEXT: retq 4988; 4989; AVX512-LABEL: load_i16_stride6_vf32: 4990; AVX512: # %bb.0: 4991; AVX512-NEXT: subq $72, %rsp 4992; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 4993; AVX512-NEXT: vmovdqa 224(%rdi), %ymm14 4994; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 4995; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7] 4996; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm1 4997; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 4998; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] 4999; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 5000; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] 5001; AVX512-NEXT: vmovdqa 160(%rdi), %ymm4 5002; AVX512-NEXT: vmovdqa (%rdi), %ymm13 5003; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10 5004; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6 5005; AVX512-NEXT: vmovdqa 128(%rdi), %ymm7 5006; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] 5007; AVX512-NEXT: vmovdqa64 %ymm7, %ymm24 5008; AVX512-NEXT: vmovdqa64 %ymm4, %ymm26 5009; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 5010; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7] 5011; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7] 5012; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] 5013; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm7 5014; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5015; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3 5016; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 5017; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3] 5018; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] 5019; AVX512-NEXT: vmovdqa64 %ymm10, %ymm16 5020; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm7 5021; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 5022; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3] 5023; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] 5024; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7] 5025; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8 5026; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7] 5027; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29 5028; AVX512-NEXT: vmovdqa64 %ymm12, %ymm28 5029; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 5030; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] 5031; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill 5032; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8 5033; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10 5034; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7] 5035; AVX512-NEXT: vmovdqa64 %ymm10, %ymm18 5036; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20 5037; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 5038; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] 5039; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] 5040; AVX512-NEXT: vpshufb %xmm9, %xmm10, %xmm9 5041; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 5042; AVX512-NEXT: vmovdqa 256(%rdi), %ymm9 5043; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm9[2,3],mem[2,3] 5044; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm15 5045; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7] 5046; AVX512-NEXT: vmovdqa64 %ymm15, %ymm25 5047; AVX512-NEXT: vmovdqa64 %ymm12, %ymm27 5048; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 5049; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] 5050; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] 5051; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] 5052; AVX512-NEXT: vmovdqa64 %ymm10, %ymm30 5053; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 5054; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 5055; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 5056; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 5057; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 5058; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4 5059; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5060; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] 5061; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5062; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 5063; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5064; AVX512-NEXT: vpshufb %xmm10, %xmm5, %xmm0 5065; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm1 5066; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 5067; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 5068; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7] 5069; AVX512-NEXT: vpshufb %xmm2, 
%xmm8, %xmm0 5070; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] 5071; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 5072; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5073; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 5074; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 5075; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 5076; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5077; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22 5078; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] 5079; AVX512-NEXT: vmovdqa64 %ymm14, %ymm31 5080; AVX512-NEXT: vmovdqa64 %ymm11, %ymm21 5081; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 5082; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] 5083; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] 5084; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7] 5085; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 5086; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] 5087; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 5088; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 5089; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2 5090; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 5091; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 5092; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] 5093; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] 5094; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 5095; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] 5096; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4] 5097; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 5098; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5099; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 5100; AVX512-NEXT: vmovdqa64 %ymm16, %ymm23 5101; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0 5102; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] 5103; AVX512-NEXT: vmovdqa64 %ymm13, %ymm19 5104; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 5105; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] 5106; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] 5107; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7] 5108; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 5109; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] 5110; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 5111; AVX512-NEXT: vmovdqa64 %ymm29, %ymm13 5112; AVX512-NEXT: vmovdqa64 %ymm28, %ymm12 5113; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] 5114; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 5115; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 5116; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5117; AVX512-NEXT: vmovdqa64 %ymm18, %ymm28 5118; AVX512-NEXT: vmovdqa64 %ymm20, %ymm29 5119; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0 5120; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1 5121; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 5122; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 5123; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] 5124; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 
= xmm4[0,0,0,0,4,5,6,7] 5125; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 5126; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] 5127; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] 5128; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] 5129; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 5130; AVX512-NEXT: vmovdqa64 %ymm25, %ymm0 5131; AVX512-NEXT: vmovdqa64 %ymm27, %ymm1 5132; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] 5133; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 5134; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] 5135; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 5136; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] 5137; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 5138; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5139; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm17 ^ (zmm0 & (zmm2 ^ zmm17)) 5140; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 5141; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm2)) 5142; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] 5143; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] 5144; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] 5145; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] 5146; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] 5147; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] 5148; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] 5149; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] 5150; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5151; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 5152; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm30 5153; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm18 5154; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] 5155; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] 5156; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] 5157; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] 5158; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 5159; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] 5160; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 5161; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] 5162; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 5163; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] 5164; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 5165; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 5166; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 5167; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] 5168; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] 5169; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm3[4,5,6,7] 5170; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm0 & (zmm5 ^ zmm2)) 5171; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 5172; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm5)) 5173; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 5174; AVX512-NEXT: vmovdqa64 %ymm31, %ymm0 5175; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 5176; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 5177; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm2 5178; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 5179; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] 5180; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 5181; AVX512-NEXT: vmovdqa64 %ymm24, %ymm3 5182; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4 5183; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] 5184; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 5185; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] 5186; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] 5187; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] 5188; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] 5189; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm5 5190; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] 5191; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 5192; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 5193; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] 5194; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3 5195; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8 5196; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] 5197; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm8 5198; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 5199; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] 5200; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] 5201; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5202; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 5203; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm11) | ymm10 5204; AVX512-NEXT: movw $31, %ax 5205; AVX512-NEXT: kmovw %eax, %k1 5206; AVX512-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} 5207; AVX512-NEXT: vmovdqa64 %ymm28, %ymm8 5208; AVX512-NEXT: vmovdqa64 %ymm29, %ymm10 5209; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7] 5210; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm8 5211; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm12 5212; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1] 5213; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] 5214; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] 5215; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7] 5216; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 5217; AVX512-NEXT: vmovdqa64 %ymm25, %ymm12 5218; AVX512-NEXT: vmovdqa64 %ymm27, %ymm13 5219; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] 5220; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 5221; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7] 5222; AVX512-NEXT: 
vinserti64x4 $1, %ymm10, %zmm0, %zmm10 5223; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 5224; AVX512-NEXT: vpshufb %xmm14, %xmm0, %xmm0 5225; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] 5226; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5227; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] 5228; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] 5229; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4 5230; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] 5231; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] 5232; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] 5233; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 5234; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 5235; AVX512-NEXT: vpshufb %xmm14, %xmm3, %xmm3 5236; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] 5237; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 5238; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 5239; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5240; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4 5241; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} 5242; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 5243; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] 5244; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] 5245; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] 5246; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5247; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 5248; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] 5249; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 5250; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5251; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload 5252; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload 5253; AVX512-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) 5254; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 5255; AVX512-NEXT: kmovw %eax, %k1 5256; AVX512-NEXT: vmovdqa32 %zmm30, %zmm4 {%k1} 5257; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) 5258; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload 5259; AVX512-NEXT: # zmm15 = mem ^ (zmm3 & (zmm15 ^ mem)) 5260; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} 5261; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx) 5262; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 5263; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ zmm2)) 5264; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0)) 5265; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx) 5266; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) 5267; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) 5268; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) 5269; AVX512-NEXT: addq $72, %rsp 5270; AVX512-NEXT: vzeroupper 5271; AVX512-NEXT: retq 5272; 5273; AVX512-FCP-LABEL: load_i16_stride6_vf32: 5274; AVX512-FCP: # %bb.0: 5275; AVX512-FCP-NEXT: subq $136, %rsp 5276; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 5277; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 5278; AVX512-FCP-NEXT: 
vmovdqa 192(%rdi), %ymm15 5279; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7] 5280; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 5281; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 5282; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3] 5283; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm3 5284; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 5285; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 5286; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 5287; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 5288; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 5289; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 5290; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7] 5291; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 5292; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 5293; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm5 5294; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 5295; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8 5296; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] 5297; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 5298; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 5299; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5300; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7] 5301; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 5302; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 5303; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm8 5304; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 5305; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] 5306; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 5307; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7] 5308; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] 5309; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8 5310; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7] 5311; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm27 5312; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 5313; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 5314; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7] 5315; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5316; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 5317; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 5318; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] 5319; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 5320; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 5321; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 5322; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 5323; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm9 5324; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] 5325; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 5326; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 5327; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] 5328; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm14 5329; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] 5330; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 5331; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 5332; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 5333; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 5334; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 5335; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 5336; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 5337; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 5338; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 5339; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm10 5340; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7] 5341; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 5342; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 5343; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 5344; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] 5345; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5346; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 5347; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5348; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 5349; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 5350; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 5351; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 5352; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 5353; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 5354; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0 5355; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5356; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 5357; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5358; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 5359; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 5360; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 5361; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5362; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 5363; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] 5364; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm30 5365; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm31 5366; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 5367; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] 5368; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] 5369; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] 5370; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] 5371; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] 5372; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 5373; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] 5374; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 5375; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 5376; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] 5377; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 5378; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 5379; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] 5380; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm7[0,1,2,3,6,5,6,4] 5381; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] 5382; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5383; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 5384; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm23 5385; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 5386; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 5387; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 5388; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 5389; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] 5390; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] 5391; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] 5392; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 5393; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] 5394; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14 5395; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 5396; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] 5397; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 5398; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 5399; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7] 5400; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm27 5401; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm15 5402; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 5403; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7] 5404; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm16 5405; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] 5406; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 5407; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1] 5408; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] 5409; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] 5410; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 5411; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 5412; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 5413; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] 5414; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 5415; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] 5416; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 5417; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] 5418; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 5419; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5420; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm17 ^ (zmm0 & (zmm11 ^ zmm17)) 5421; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 5422; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm11)) 5423; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] 5424; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] 5425; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] 5426; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 5427; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 5428; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] 5429; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] 5430; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5431; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 5432; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] 5433; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 5434; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] 5435; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 5436; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 5437; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 5438; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 5439; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 5440; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 5441; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] 5442; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] 5443; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5444; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 5445; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] 5446; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] 5447; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 5448; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7)) 5449; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 5450; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4)) 5451; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 5452; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 5453; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 5454; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 5455; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0 5456; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 5457; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] 5458; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 5459; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 5460; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 5461; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 5462; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6 5463; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] 5464; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 5465; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm2 5466; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm7 5467; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] 5468; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5469; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm2 5470; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 5471; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 5472; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9 5473; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] 5474; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 5475; AVX512-FCP-NEXT: 
vextracti128 $1, %ymm7, %xmm9 5476; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7] 5477; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] 5478; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5479; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 5480; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm11) | ymm10 5481; AVX512-FCP-NEXT: movw $31, %ax 5482; AVX512-FCP-NEXT: kmovw %eax, %k1 5483; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} 5484; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 5485; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] 5486; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm5 5487; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm13 5488; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] 5489; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm8 5490; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7] 5491; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5492; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm12 5493; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 5494; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] 5495; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 5496; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] 5497; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 5498; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 5499; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 5500; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 5501; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] 5502; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5503; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm11) 5504; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 5505; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 5506; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] 5507; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 5508; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 5509; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 5510; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] 5511; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 5512; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 5513; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 5514; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm0 5515; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3 5516; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] 5517; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5518; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 5519; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 5520; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 5521; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5522; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 5523; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload 5524; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) 5525; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 5526; AVX512-FCP-NEXT: kmovw %eax, %k1 5527; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} 5528; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) 5529; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload 5530; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload 5531; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) 5532; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} 5533; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) 5534; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5535; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm2)) 5536; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm1)) 5537; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) 5538; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) 5539; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) 5540; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 5541; AVX512-FCP-NEXT: addq $136, %rsp 5542; AVX512-FCP-NEXT: vzeroupper 5543; AVX512-FCP-NEXT: retq 5544; 5545; AVX512DQ-LABEL: load_i16_stride6_vf32: 5546; AVX512DQ: # %bb.0: 5547; AVX512DQ-NEXT: pushq %rax 5548; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 5549; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13 5550; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 5551; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] 5552; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25 5553; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 5554; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9 5555; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] 5556; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 5557; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] 5558; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm2 5559; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm5 5560; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm6 5561; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 5562; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm7 5563; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] 5564; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm20 5565; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 5566; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 5567; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] 5568; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7] 5569; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] 5570; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm2 5571; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5572; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 5573; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] 5574; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] 5575; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm18 5576; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19 5577; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 5578; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm6 5579; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3] 5580; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] 5581; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7] 5582; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm12 5583; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7] 5584; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm28 5585; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 5586; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7] 5587; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0 5588; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm5 5589; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] 5590; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21 5591; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 5592; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 5593; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7] 5594; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7] 5595; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm7 5596; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm8 5597; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm7 5598; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] 5599; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 5600; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] 5601; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm24 5602; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26 5603; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 5604; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] 5605; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 5606; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] 5607; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5608; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16)) 5609; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800 5610; AVX512DQ-NEXT: kmovw %eax, %k1 5611; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} 5612; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5613; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 5614; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm8 5615; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 5616; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7] 5617; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 5618; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm8 5619; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 5620; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3],xmm8[4,5],xmm14[6],xmm8[7] 5621; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5622; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm8, %zmm3 5623; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6 5624; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 5625; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7] 5626; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 5627; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] 5628; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm0 5629; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5630; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 5631; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm17 & (zmm2 ^ zmm3)) 5632; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5633; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 5634; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 5635; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 5636; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5637; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} 5638; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5639; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0 5640; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] 5641; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm29 5642; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm30 5643; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] 5644; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 5645; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] 5646; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,3,4,5,6,7] 5647; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 5648; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] 5649; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 5650; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 5651; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 5652; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 5653; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 5654; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] 5655; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] 5656; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 5657; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] 5658; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] 5659; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 5660; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5661; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 5662; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm25 5663; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm13 5664; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 5665; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] 5666; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 5667; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] 5668; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] 5669; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] 5670; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 5671; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] 5672; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 5673; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm10 5674; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7] 5675; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 5676; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 5677; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5678; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0 5679; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm1 5680; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 5681; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 5682; AVX512DQ-NEXT: vpshufd {{.*#+}} 
xmm3 = xmm0[2,1,0,3] 5683; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] 5684; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 5685; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 5686; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4] 5687; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] 5688; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 5689; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 5690; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2 5691; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7] 5692; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 5693; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] 5694; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 5695; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] 5696; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 5697; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5698; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16)) 5699; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 5700; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9)) 5701; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] 5702; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] 5703; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] 5704; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] 5705; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] 5706; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7] 5707; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] 5708; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7] 5709; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5710; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 5711; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] 5712; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] 5713; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] 5714; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] 5715; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 5716; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] 5717; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 5718; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] 5719; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] 5720; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 5721; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] 5722; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5723; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 5724; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 5725; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] 5726; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 5727; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7)) 5728; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 5729; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm4)) 5730; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 5731; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 5732; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1 5733; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 5734; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2 5735; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 5736; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] 5737; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 5738; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 5739; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 5740; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] 5741; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm2 5742; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1] 5743; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7] 5744; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] 5745; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] 5746; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm5 5747; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] 5748; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 5749; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 5750; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] 5751; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm5 5752; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] 5753; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm6 5754; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm13 5755; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7] 5756; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] 5757; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5758; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 5759; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm8 5760; AVX512DQ-NEXT: movw $31, %ax 5761; AVX512DQ-NEXT: kmovw %eax, %k1 5762; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1} 5763; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6 5764; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm8 5765; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] 5766; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 5767; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm11 5768; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1] 5769; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7] 5770; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] 5771; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7] 5772; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5773; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm11 5774; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm10 5775; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7] 5776; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 5777; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] 5778; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 5779; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 5780; AVX512DQ-NEXT: vpshufb %xmm15, %xmm0, %xmm0 5781; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] 5782; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5783; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] 5784; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] 5785; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2 5786; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] 5787; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] 5788; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7] 5789; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5790; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 5791; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm2 5792; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3] 5793; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 5794; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] 5795; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5796; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm14) | ymm3 5797; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} 5798; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 5799; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] 5800; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] 5801; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] 5802; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5803; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 5804; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 5805; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 5806; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 5807; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi) 5808; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 5809; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) 5810; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 5811; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm4)) 5812; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0)) 5813; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rcx) 5814; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) 5815; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9) 5816; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) 5817; AVX512DQ-NEXT: popq %rax 5818; AVX512DQ-NEXT: vzeroupper 5819; AVX512DQ-NEXT: retq 5820; 5821; AVX512DQ-FCP-LABEL: load_i16_stride6_vf32: 5822; AVX512DQ-FCP: # %bb.0: 5823; AVX512DQ-FCP-NEXT: pushq %rax 5824; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 5825; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 5826; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 5827; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] 5828; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 5829; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 5830; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 5831; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] 5832; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm2 5833; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] 5834; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 5835; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 5836; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 5837; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 5838; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 5839; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] 5840; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 5841; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 5842; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 5843; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm4 5844; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm7 5845; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm8 5846; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7] 5847; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 5848; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 5849; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7] 5850; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm18 5851; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 5852; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 5853; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 5854; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] 5855; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 5856; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] 5857; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] 5858; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 5859; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7] 5860; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 5861; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 5862; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7] 5863; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 5864; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 5865; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 5866; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 5867; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 5868; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm11 5869; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 5870; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 5871; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] 5872; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 5873; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 5874; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] 5875; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11 5876; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] 5877; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm23 5878; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 5879; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 5880; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] 5881; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 5882; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] 5883; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5884; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16)) 5885; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 5886; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 5887; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} 5888; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5889; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 5890; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 5891; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 5892; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7] 5893; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 5894; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 5895; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5] 5896; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7] 5897; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5898; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm5 5899; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 5900; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 5901; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7] 5902; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 5903; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] 5904; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0 5905; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] 5906; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 5907; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm17 & (zmm2 ^ zmm5)) 5908; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5909; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 5910; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 5911; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 5912; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 5913; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} 5914; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5915; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 5916; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] 5917; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm29 5918; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm30 5919; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] 5920; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 5921; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] 5922; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] 5923; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] 5924; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 5925; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 5926; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 5927; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] 5928; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 5929; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] 
5930; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 5931; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm3 5932; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] 5933; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] 5934; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] 5935; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5936; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 5937; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm24 5938; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 5939; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 5940; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] 5941; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 5942; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] 5943; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] 5944; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] 5945; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 5946; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] 5947; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 5948; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7] 5949; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 5950; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] 5951; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7] 5952; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 5953; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 5954; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 5955; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm17 5956; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] 5957; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 5958; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1] 5959; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4] 5960; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] 5961; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 5962; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 5963; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 5964; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] 5965; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 5966; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] 5967; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 5968; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] 5969; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 5970; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 5971; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16)) 5972; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 5973; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9)) 5974; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = 
xmm15[3,1,2,1,4,5,6,7] 5975; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] 5976; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] 5977; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 5978; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm11 5979; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] 5980; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7] 5981; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5982; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 5983; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] 5984; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] 5985; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] 5986; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 5987; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] 5988; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 5989; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm4 5990; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] 5991; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] 5992; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5993; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 5994; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] 5995; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] 5996; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 5997; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm0 & (zmm3 ^ zmm7)) 5998; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 5999; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3)) 6000; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 6001; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 6002; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 6003; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 6004; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 6005; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1 6006; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7] 6007; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 6008; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 6009; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 6010; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] 6011; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 6012; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] 6013; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 6014; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 6015; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 6016; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] 6017; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6018; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 6019; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] 6020; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 6021; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] 6022; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 6023; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 6024; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7] 6025; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] 6026; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6027; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 6028; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm13) | ymm11 6029; AVX512DQ-FCP-NEXT: movw $31, %ax 6030; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 6031; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1} 6032; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm5 6033; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm11 6034; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] 6035; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm5 6036; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm14 6037; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] 6038; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7 6039; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7] 6040; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6041; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm14 6042; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 6043; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7] 6044; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] 6045; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] 6046; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 6047; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 6048; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 6049; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 6050; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7] 6051; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6052; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm8 & ymm13) 6053; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 6054; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 6055; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] 6056; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 6057; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 6058; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 6059; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] 6060; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 6061; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 6062; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1} 6063; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm2 6064; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 6065; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] 6066; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6067; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 6068; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 6069; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 6070; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 6071; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi) 6072; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 6073; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) 6074; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6075; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm17 & (zmm7 ^ zmm0)) 6076; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm17 & (zmm2 ^ zmm1)) 6077; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) 6078; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) 6079; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) 6080; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) 6081; AVX512DQ-FCP-NEXT: popq %rax 6082; AVX512DQ-FCP-NEXT: vzeroupper 6083; AVX512DQ-FCP-NEXT: retq 6084; 6085; AVX512BW-LABEL: load_i16_stride6_vf32: 6086; AVX512BW: # %bb.0: 6087; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 6088; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 6089; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 6090; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 6091; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 6092; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 6093; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 6094; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] 6095; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 6096; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 6097; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] 6098; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6099; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 6100; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 6101; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 6102; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 6103; AVX512BW-NEXT: kmovd %edi, %k1 6104; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} 6105; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 6106; AVX512BW-NEXT: kmovd %edi, %k2 6107; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} 6108; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] 6109; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6110; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6111; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] 6112; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6113; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 6114; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 6115; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 6116; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 6117; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} 6118; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] 6119; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6120; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6121; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] 6122; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6123; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6124; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = 
[34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 6125; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 6126; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 6127; AVX512BW-NEXT: kmovd %edi, %k2 6128; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} 6129; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 6130; AVX512BW-NEXT: kmovd %edi, %k1 6131; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k1} 6132; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] 6133; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6134; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6135; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] 6136; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6137; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6138; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 6139; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 6140; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} 6141; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} 6142; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 6143; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 6144; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] 6145; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6146; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6147; AVX512BW-NEXT: movw $31, %di 6148; AVX512BW-NEXT: kmovd %edi, %k2 6149; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} 6150; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] 6151; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6152; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 6153; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 6154; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 6155; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 6156; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] 6157; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 6158; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 6159; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm3 {%k2} 6160; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] 6161; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 6162; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 6163; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 6164; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi) 6165; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx) 6166; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) 6167; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) 6168; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r9) 6169; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) 6170; AVX512BW-NEXT: vzeroupper 6171; AVX512BW-NEXT: retq 6172; 6173; AVX512BW-FCP-LABEL: load_i16_stride6_vf32: 6174; AVX512BW-FCP: # %bb.0: 6175; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6176; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 6177; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 6178; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 6179; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 6180; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 6181; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 6182; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] 6183; AVX512BW-FCP-NEXT: # 
zmm7 = mem[0,1,2,3,0,1,2,3] 6184; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 6185; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] 6186; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6187; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 6188; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 6189; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 6190; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 6191; AVX512BW-FCP-NEXT: kmovd %edi, %k1 6192; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} 6193; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 6194; AVX512BW-FCP-NEXT: kmovd %edi, %k2 6195; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} 6196; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] 6197; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6198; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6199; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] 6200; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6201; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 6202; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 6203; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 6204; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 6205; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} 6206; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] 6207; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6208; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6209; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] 6210; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6211; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6212; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 6213; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 6214; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 6215; AVX512BW-FCP-NEXT: kmovd %edi, %k2 6216; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} 6217; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 6218; AVX512BW-FCP-NEXT: kmovd %edi, %k1 6219; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k1} 6220; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] 6221; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6222; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6223; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] 6224; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6225; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6226; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 6227; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 6228; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} 6229; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} 6230; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 6231; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 6232; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] 6233; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6234; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6235; AVX512BW-FCP-NEXT: movw $31, %di 6236; AVX512BW-FCP-NEXT: kmovd %edi, %k2 6237; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} 6238; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] 6239; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6240; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 6241; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 6242; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 6243; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 6244; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] 6245; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 6246; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 6247; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm3 {%k2} 6248; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] 6249; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 6250; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 6251; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} 6252; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) 6253; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) 6254; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) 6255; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) 6256; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%r9) 6257; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 6258; AVX512BW-FCP-NEXT: vzeroupper 6259; AVX512BW-FCP-NEXT: retq 6260; 6261; AVX512DQ-BW-LABEL: load_i16_stride6_vf32: 6262; AVX512DQ-BW: # %bb.0: 6263; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 6264; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 6265; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 6266; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 6267; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 6268; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 6269; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 6270; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] 6271; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 6272; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 6273; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] 6274; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6275; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 6276; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 6277; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 6278; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 6279; AVX512DQ-BW-NEXT: kmovd %edi, %k1 6280; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} 6281; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800 6282; AVX512DQ-BW-NEXT: kmovd %edi, %k2 6283; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} 6284; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] 6285; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6286; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6287; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] 6288; AVX512DQ-BW-NEXT: # zmm9 = 
mem[0,1,2,3,0,1,2,3] 6289; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 6290; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 6291; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 6292; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} 6293; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} 6294; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] 6295; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6296; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6297; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] 6298; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6299; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6300; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 6301; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 6302; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 6303; AVX512DQ-BW-NEXT: kmovd %edi, %k2 6304; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} 6305; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 6306; AVX512DQ-BW-NEXT: kmovd %edi, %k1 6307; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k1} 6308; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] 6309; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6310; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 6311; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] 6312; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6313; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6314; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 6315; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 6316; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} 6317; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} 6318; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 6319; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 6320; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] 6321; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 6322; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 6323; AVX512DQ-BW-NEXT: movw $31, %di 6324; AVX512DQ-BW-NEXT: kmovd %edi, %k2 6325; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} 6326; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] 6327; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 6328; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 6329; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 6330; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 6331; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 6332; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] 6333; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 6334; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 6335; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm3 {%k2} 6336; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] 6337; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 6338; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 6339; 
AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm6
; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm10
; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT: movw $31, %di
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm3 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <192 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
  %strided.vec1 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
  %strided.vec2 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
  %strided.vec3 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
  %strided.vec4 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
  %strided.vec5 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
  store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i16_stride6_vf64:
; SSE: # %bb.0:
; SSE-NEXT: subq $1176, %rsp # imm = 0x498
; SSE-NEXT: movdqa 496(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 512(%rdi), %xmm8
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 144(%rdi), %xmm7
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 160(%rdi), %xmm3
; SSE-NEXT: movdqa 176(%rdi), %xmm0
; SSE-NEXT: movdqa 112(%rdi), %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 128(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
; SSE-NEXT: movdqa %xmm9, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE-NEXT: pand %xmm9, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
;
SSE-NEXT: movdqa %xmm1, %xmm2 6481; SSE-NEXT: movdqa %xmm3, %xmm1 6482; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] 6483; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 6484; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6485; SSE-NEXT: movdqa %xmm0, %xmm4 6486; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] 6487; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6488; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6489; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] 6490; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] 6491; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6492; SSE-NEXT: pslld $16, %xmm0 6493; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6494; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6495; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3] 6496; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6497; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 6498; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] 6499; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 6500; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6501; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7] 6502; SSE-NEXT: movdqa %xmm9, %xmm1 6503; SSE-NEXT: pandn %xmm0, %xmm1 6504; SSE-NEXT: movdqa 480(%rdi), %xmm0 6505; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6506; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 6507; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6508; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6509; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] 6510; SSE-NEXT: pand %xmm9, %xmm0 6511; SSE-NEXT: por %xmm1, %xmm0 6512; SSE-NEXT: movdqa %xmm0, %xmm2 6513; SSE-NEXT: movdqa 544(%rdi), %xmm3 6514; SSE-NEXT: movdqa 560(%rdi), %xmm1 6515; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] 6516; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6517; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6518; SSE-NEXT: movdqa %xmm1, %xmm0 6519; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] 6520; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6521; SSE-NEXT: movdqa %xmm3, %xmm0 6522; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6523; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] 6524; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] 6525; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6526; SSE-NEXT: pslld $16, %xmm1 6527; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6528; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6529; SSE-NEXT: movdqa 528(%rdi), %xmm1 6530; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6531; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 6532; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6533; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] 6534; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] 6535; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 6536; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6537; SSE-NEXT: movdqa 32(%rdi), %xmm0 6538; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6539; SSE-NEXT: pshuflw 
{{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 6540; SSE-NEXT: movdqa %xmm9, %xmm1 6541; SSE-NEXT: pandn %xmm0, %xmm1 6542; SSE-NEXT: movdqa (%rdi), %xmm0 6543; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6544; SSE-NEXT: movdqa 16(%rdi), %xmm6 6545; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 6546; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6547; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6548; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] 6549; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6550; SSE-NEXT: pand %xmm9, %xmm0 6551; SSE-NEXT: por %xmm1, %xmm0 6552; SSE-NEXT: movdqa %xmm0, %xmm2 6553; SSE-NEXT: movdqa 64(%rdi), %xmm3 6554; SSE-NEXT: movdqa 80(%rdi), %xmm0 6555; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] 6556; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6557; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6558; SSE-NEXT: movdqa %xmm0, %xmm1 6559; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] 6560; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6561; SSE-NEXT: movdqa %xmm3, %xmm1 6562; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill 6563; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] 6564; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] 6565; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6566; SSE-NEXT: pslld $16, %xmm0 6567; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6568; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6569; SSE-NEXT: movdqa 48(%rdi), %xmm0 6570; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6571; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 6572; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6573; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 6574; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] 6575; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 6576; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6577; SSE-NEXT: movdqa 416(%rdi), %xmm0 6578; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6579; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 6580; SSE-NEXT: movdqa %xmm9, %xmm1 6581; SSE-NEXT: pandn %xmm0, %xmm1 6582; SSE-NEXT: movdqa 400(%rdi), %xmm2 6583; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6584; SSE-NEXT: movdqa 384(%rdi), %xmm0 6585; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6586; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 6587; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6588; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6589; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 6590; SSE-NEXT: pand %xmm9, %xmm0 6591; SSE-NEXT: por %xmm1, %xmm0 6592; SSE-NEXT: movdqa %xmm0, %xmm2 6593; SSE-NEXT: movdqa 448(%rdi), %xmm3 6594; SSE-NEXT: movdqa 464(%rdi), %xmm0 6595; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] 6596; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] 6597; SSE-NEXT: movdqa %xmm0, %xmm1 6598; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] 6599; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6600; SSE-NEXT: movdqa %xmm3, %xmm1 6601; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6602; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[1,0],xmm0[0,0] 6603; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] 6604; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6605; SSE-NEXT: pslld $16, %xmm0 6606; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6607; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6608; SSE-NEXT: movdqa 432(%rdi), %xmm0 6609; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6610; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 6611; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6612; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 6613; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] 6614; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 6615; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6616; SSE-NEXT: movdqa 320(%rdi), %xmm0 6617; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6618; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 6619; SSE-NEXT: movdqa %xmm9, %xmm1 6620; SSE-NEXT: pandn %xmm0, %xmm1 6621; SSE-NEXT: movdqa 304(%rdi), %xmm2 6622; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6623; SSE-NEXT: movdqa 288(%rdi), %xmm0 6624; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6625; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 6626; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6627; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6628; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 6629; SSE-NEXT: pand %xmm9, %xmm0 6630; SSE-NEXT: por %xmm1, %xmm0 6631; SSE-NEXT: movdqa %xmm0, %xmm2 6632; SSE-NEXT: movdqa 352(%rdi), %xmm3 6633; SSE-NEXT: movdqa 368(%rdi), %xmm0 6634; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] 6635; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] 6636; SSE-NEXT: movdqa %xmm0, %xmm1 6637; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] 6638; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6639; SSE-NEXT: movdqa %xmm3, %xmm1 6640; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6641; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] 6642; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] 6643; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6644; SSE-NEXT: pslld $16, %xmm0 6645; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6646; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6647; SSE-NEXT: movdqa 336(%rdi), %xmm0 6648; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6649; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 6650; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6651; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 6652; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] 6653; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 6654; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6655; SSE-NEXT: movdqa 704(%rdi), %xmm0 6656; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6657; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 6658; SSE-NEXT: movdqa %xmm9, %xmm1 6659; SSE-NEXT: pandn %xmm0, %xmm1 6660; SSE-NEXT: movdqa 688(%rdi), %xmm2 6661; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6662; SSE-NEXT: movdqa 672(%rdi), %xmm0 6663; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6664; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 6665; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6666; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6667; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 6668; SSE-NEXT: pand %xmm9, %xmm0 6669; SSE-NEXT: por %xmm1, %xmm0 6670; SSE-NEXT: movdqa %xmm0, %xmm2 6671; SSE-NEXT: movdqa 736(%rdi), %xmm3 6672; SSE-NEXT: movdqa 752(%rdi), %xmm1 6673; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] 6674; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] 6675; SSE-NEXT: movdqa %xmm1, %xmm0 6676; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] 6677; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6678; SSE-NEXT: movdqa %xmm3, %xmm0 6679; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6680; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] 6681; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] 6682; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6683; SSE-NEXT: pslld $16, %xmm1 6684; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6685; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 6686; SSE-NEXT: movdqa 720(%rdi), %xmm1 6687; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6688; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 6689; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6690; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] 6691; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] 6692; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 6693; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6694; SSE-NEXT: movdqa 224(%rdi), %xmm0 6695; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6696; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 6697; SSE-NEXT: movdqa %xmm9, %xmm1 6698; SSE-NEXT: pandn %xmm0, %xmm1 6699; SSE-NEXT: movdqa 208(%rdi), %xmm2 6700; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6701; SSE-NEXT: movdqa 192(%rdi), %xmm0 6702; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6703; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 6704; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6705; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6706; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 6707; SSE-NEXT: pand %xmm9, %xmm0 6708; SSE-NEXT: por %xmm1, %xmm0 6709; SSE-NEXT: movdqa %xmm0, %xmm1 6710; SSE-NEXT: movdqa 256(%rdi), %xmm4 6711; SSE-NEXT: movdqa 272(%rdi), %xmm0 6712; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] 6713; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 6714; SSE-NEXT: movdqa %xmm0, %xmm2 6715; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] 6716; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6717; SSE-NEXT: movdqa %xmm4, %xmm2 6718; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6719; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] 6720; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] 6721; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6722; SSE-NEXT: pslld $16, %xmm0 6723; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6724; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 6725; SSE-NEXT: movdqa 240(%rdi), %xmm0 6726; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6727; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 6728; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6729; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 6730; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] 6731; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 6732; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6733; SSE-NEXT: movdqa 608(%rdi), %xmm0 6734; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6735; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 6736; SSE-NEXT: movdqa %xmm9, %xmm2 6737; SSE-NEXT: pandn %xmm0, %xmm2 6738; SSE-NEXT: movdqa 592(%rdi), %xmm13 6739; SSE-NEXT: movdqa 576(%rdi), %xmm0 6740; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6741; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] 6742; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] 6743; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] 6744; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6745; SSE-NEXT: pand %xmm9, %xmm0 6746; SSE-NEXT: por %xmm2, %xmm0 6747; SSE-NEXT: movdqa %xmm0, %xmm1 6748; SSE-NEXT: movdqa 640(%rdi), %xmm5 6749; SSE-NEXT: movdqa 656(%rdi), %xmm2 6750; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] 6751; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 6752; SSE-NEXT: movdqa %xmm2, %xmm0 6753; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] 6754; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6755; SSE-NEXT: movdqa %xmm5, %xmm0 6756; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6757; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] 6758; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] 6759; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6760; SSE-NEXT: pslld $16, %xmm2 6761; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6762; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 6763; SSE-NEXT: movdqa 624(%rdi), %xmm2 6764; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6765; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] 6766; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7] 6767; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3] 6768; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 6769; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6770; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6771; SSE-NEXT: movdqa %xmm11, %xmm0 6772; SSE-NEXT: psrld $16, %xmm0 6773; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6774; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6775; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6776; SSE-NEXT: movdqa %xmm9, %xmm0 6777; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6778; SSE-NEXT: pandn %xmm2, %xmm0 6779; SSE-NEXT: pand %xmm9, %xmm1 6780; SSE-NEXT: por %xmm0, %xmm1 6781; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6782; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6783; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6784; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3] 6785; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] 6786; 
SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6787; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6788; SSE-NEXT: psrld $16, %xmm0 6789; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6790; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6791; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6792; SSE-NEXT: movdqa %xmm9, %xmm0 6793; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6794; SSE-NEXT: pand %xmm9, %xmm1 6795; SSE-NEXT: por %xmm0, %xmm1 6796; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6797; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6798; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6799; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] 6800; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] 6801; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6802; SSE-NEXT: movdqa %xmm6, %xmm0 6803; SSE-NEXT: psrld $16, %xmm0 6804; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6805; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6806; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6807; SSE-NEXT: movdqa %xmm9, %xmm0 6808; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6809; SSE-NEXT: pandn %xmm6, %xmm0 6810; SSE-NEXT: pand %xmm9, %xmm1 6811; SSE-NEXT: por %xmm0, %xmm1 6812; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6813; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6814; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6815; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] 6816; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] 6817; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6818; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6819; SSE-NEXT: psrld $16, %xmm0 6820; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6821; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6822; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6823; SSE-NEXT: movdqa %xmm9, %xmm0 6824; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6825; SSE-NEXT: pand %xmm9, %xmm1 6826; SSE-NEXT: por %xmm0, %xmm1 6827; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6828; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6829; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3] 6830; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] 6831; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6832; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6833; SSE-NEXT: movdqa %xmm12, %xmm0 6834; SSE-NEXT: psrld $16, %xmm0 6835; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6836; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6837; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6838; SSE-NEXT: movdqa %xmm9, %xmm0 6839; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6840; SSE-NEXT: pandn %xmm5, %xmm0 6841; SSE-NEXT: pand %xmm9, %xmm1 6842; SSE-NEXT: por %xmm0, %xmm1 6843; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6844; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6845; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] 6846; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] 6847; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6848; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6849; SSE-NEXT: psrld $16, %xmm0 6850; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6851; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6852; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6853; SSE-NEXT: movdqa %xmm9, %xmm0 6854; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6855; SSE-NEXT: pand %xmm9, %xmm1 6856; SSE-NEXT: por %xmm0, %xmm1 6857; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6858; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6859; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] 6860; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] 6861; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6862; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6863; SSE-NEXT: movdqa %xmm15, %xmm0 6864; SSE-NEXT: psrld $16, %xmm0 6865; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6866; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] 6867; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 6868; SSE-NEXT: movdqa %xmm9, %xmm0 6869; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6870; SSE-NEXT: pandn %xmm10, %xmm0 6871; SSE-NEXT: pand %xmm9, %xmm1 6872; SSE-NEXT: por %xmm0, %xmm1 6873; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6874; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] 6875; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] 6876; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] 6877; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6878; SSE-NEXT: psrld $16, %xmm13 6879; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] 6880; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] 6881; SSE-NEXT: pand %xmm9, %xmm3 6882; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 6883; SSE-NEXT: por %xmm3, %xmm9 6884; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] 6885; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] 6886; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] 6887; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6888; SSE-NEXT: movdqa %xmm2, %xmm0 6889; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6890; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6891; SSE-NEXT: # xmm1 = mem[1,1,1,1] 6892; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 6893; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] 6894; SSE-NEXT: movdqa %xmm4, %xmm2 6895; SSE-NEXT: pandn %xmm0, %xmm2 6896; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6897; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] 6898; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] 6899; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] 6900; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 6901; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] 6902; SSE-NEXT: pand %xmm4, %xmm3 6903; SSE-NEXT: por %xmm2, %xmm3 6904; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6905; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6906; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 6907; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6908; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 6909; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 6910; SSE-NEXT: pshufhw 
{{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] 6911; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] 6912; SSE-NEXT: movdqa %xmm14, %xmm0 6913; SSE-NEXT: pandn %xmm2, %xmm0 6914; SSE-NEXT: pand %xmm14, %xmm3 6915; SSE-NEXT: por %xmm3, %xmm0 6916; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6917; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6918; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6919; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] 6920; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] 6921; SSE-NEXT: movdqa %xmm4, %xmm3 6922; SSE-NEXT: pandn %xmm6, %xmm3 6923; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6924; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6925; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] 6926; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] 6927; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] 6928; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 6929; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] 6930; SSE-NEXT: pand %xmm4, %xmm2 6931; SSE-NEXT: por %xmm3, %xmm2 6932; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 6933; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6934; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 6935; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 6936; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] 6937; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] 6938; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] 6939; SSE-NEXT: movdqa %xmm14, %xmm0 6940; SSE-NEXT: pandn %xmm3, %xmm0 6941; SSE-NEXT: pand %xmm14, %xmm2 6942; SSE-NEXT: por %xmm2, %xmm0 6943; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6944; SSE-NEXT: movdqa %xmm5, %xmm2 6945; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6946; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 6947; SSE-NEXT: # xmm3 = mem[1,1,1,1] 6948; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 6949; SSE-NEXT: movdqa %xmm4, %xmm5 6950; SSE-NEXT: pandn %xmm2, %xmm5 6951; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6952; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] 6953; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] 6954; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] 6955; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 6956; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] 6957; SSE-NEXT: pand %xmm4, %xmm2 6958; SSE-NEXT: por %xmm5, %xmm2 6959; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6960; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6961; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 6962; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6963; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] 6964; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 6965; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] 6966; SSE-NEXT: movdqa %xmm14, %xmm0 6967; SSE-NEXT: pandn %xmm5, %xmm0 6968; SSE-NEXT: pand %xmm14, %xmm2 6969; SSE-NEXT: por %xmm2, %xmm0 6970; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6971; SSE-NEXT: movdqa %xmm10, %xmm2 6972; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 6973; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6974; SSE-NEXT: # xmm5 = mem[1,1,1,1] 6975; SSE-NEXT: punpcklqdq 
{{.*#+}} xmm2 = xmm2[0],xmm5[0] 6976; SSE-NEXT: movdqa %xmm4, %xmm5 6977; SSE-NEXT: pandn %xmm2, %xmm5 6978; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6979; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0] 6980; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3] 6981; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7] 6982; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 6983; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] 6984; SSE-NEXT: pand %xmm4, %xmm2 6985; SSE-NEXT: por %xmm5, %xmm2 6986; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6987; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6988; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 6989; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6990; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] 6991; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 6992; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] 6993; SSE-NEXT: movdqa %xmm14, %xmm0 6994; SSE-NEXT: pandn %xmm5, %xmm0 6995; SSE-NEXT: pand %xmm14, %xmm2 6996; SSE-NEXT: por %xmm2, %xmm0 6997; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6998; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6999; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7000; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 7001; SSE-NEXT: # xmm5 = mem[1,1,1,1] 7002; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] 7003; SSE-NEXT: movdqa %xmm4, %xmm5 7004; SSE-NEXT: pandn %xmm2, %xmm5 7005; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7006; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7007; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] 7008; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] 7009; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] 7010; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] 7011; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] 7012; SSE-NEXT: pand %xmm4, %xmm6 7013; SSE-NEXT: por %xmm5, %xmm6 7014; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7015; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7016; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 7017; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7018; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] 7019; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 7020; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] 7021; SSE-NEXT: movdqa %xmm14, %xmm0 7022; SSE-NEXT: pandn %xmm5, %xmm0 7023; SSE-NEXT: pand %xmm14, %xmm6 7024; SSE-NEXT: por %xmm6, %xmm0 7025; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7026; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7027; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7028; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 7029; SSE-NEXT: # xmm6 = mem[1,1,1,1] 7030; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 7031; SSE-NEXT: movdqa %xmm4, %xmm6 7032; SSE-NEXT: pandn %xmm5, %xmm6 7033; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 7034; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7035; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] 7036; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3] 7037; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] 7038; SSE-NEXT: pshufd 
{{.*#+}} xmm5 = xmm5[0,3,2,3] 7039; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] 7040; SSE-NEXT: pand %xmm4, %xmm5 7041; SSE-NEXT: por %xmm6, %xmm5 7042; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7043; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7044; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 7045; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7046; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] 7047; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] 7048; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 7049; SSE-NEXT: movdqa %xmm14, %xmm0 7050; SSE-NEXT: pandn %xmm6, %xmm0 7051; SSE-NEXT: pand %xmm14, %xmm5 7052; SSE-NEXT: por %xmm5, %xmm0 7053; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7054; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7055; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7056; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 7057; SSE-NEXT: # xmm6 = mem[1,1,1,1] 7058; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 7059; SSE-NEXT: movdqa %xmm4, %xmm6 7060; SSE-NEXT: pandn %xmm5, %xmm6 7061; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7062; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7063; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] 7064; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] 7065; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] 7066; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] 7067; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] 7068; SSE-NEXT: pand %xmm4, %xmm7 7069; SSE-NEXT: por %xmm6, %xmm7 7070; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7071; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7072; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] 7073; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7074; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] 7075; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] 7076; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 7077; SSE-NEXT: movdqa %xmm14, %xmm0 7078; SSE-NEXT: pandn %xmm6, %xmm0 7079; SSE-NEXT: pand %xmm14, %xmm7 7080; SSE-NEXT: por %xmm7, %xmm0 7081; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7082; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 7083; SSE-NEXT: movdqa %xmm13, %xmm6 7084; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7085; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7086; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 7087; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] 7088; SSE-NEXT: movdqa %xmm4, %xmm12 7089; SSE-NEXT: pandn %xmm6, %xmm12 7090; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7091; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7092; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] 7093; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] 7094; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] 7095; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] 7096; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] 7097; SSE-NEXT: pand %xmm4, %xmm6 7098; SSE-NEXT: por %xmm12, %xmm6 7099; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7100; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 7101; SSE-NEXT: # xmm12 = 
xmm12[0,1],mem[0,2] 7102; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7103; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] 7104; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] 7105; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] 7106; SSE-NEXT: movdqa %xmm14, %xmm15 7107; SSE-NEXT: pandn %xmm12, %xmm15 7108; SSE-NEXT: pand %xmm14, %xmm6 7109; SSE-NEXT: por %xmm6, %xmm15 7110; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7111; SSE-NEXT: psrlq $48, %xmm9 7112; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 7113; SSE-NEXT: # xmm12 = mem[2,2,3,3] 7114; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0] 7115; SSE-NEXT: movdqa %xmm4, %xmm6 7116; SSE-NEXT: pandn %xmm12, %xmm6 7117; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] 7118; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] 7119; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] 7120; SSE-NEXT: pand %xmm4, %xmm8 7121; SSE-NEXT: por %xmm6, %xmm8 7122; SSE-NEXT: pshufhw $231, (%rsp), %xmm6 # 16-byte Folded Reload 7123; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] 7124; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] 7125; SSE-NEXT: movdqa %xmm14, %xmm12 7126; SSE-NEXT: pandn %xmm6, %xmm12 7127; SSE-NEXT: pand %xmm14, %xmm8 7128; SSE-NEXT: por %xmm8, %xmm12 7129; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill 7130; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 7131; SSE-NEXT: movdqa %xmm9, %xmm6 7132; SSE-NEXT: psrlq $48, %xmm6 7133; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 7134; SSE-NEXT: # xmm8 = mem[2,2,3,3] 7135; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] 7136; SSE-NEXT: movdqa %xmm4, %xmm6 7137; SSE-NEXT: pandn %xmm8, %xmm6 7138; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 7139; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 7140; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] 7141; SSE-NEXT: pand %xmm4, %xmm1 7142; SSE-NEXT: por %xmm6, %xmm1 7143; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 7144; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] 7145; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] 7146; SSE-NEXT: movdqa %xmm14, %xmm8 7147; SSE-NEXT: pandn %xmm6, %xmm8 7148; SSE-NEXT: pand %xmm14, %xmm1 7149; SSE-NEXT: por %xmm1, %xmm8 7150; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7151; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7152; SSE-NEXT: movdqa %xmm15, %xmm1 7153; SSE-NEXT: psrlq $48, %xmm1 7154; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 7155; SSE-NEXT: # xmm6 = mem[2,2,3,3] 7156; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] 7157; SSE-NEXT: movdqa %xmm4, %xmm1 7158; SSE-NEXT: pandn %xmm6, %xmm1 7159; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] 7160; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] 7161; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] 7162; SSE-NEXT: pand %xmm4, %xmm6 7163; SSE-NEXT: por %xmm1, %xmm6 7164; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7165; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] 7166; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 7167; SSE-NEXT: movdqa %xmm14, %xmm8 7168; SSE-NEXT: pandn %xmm1, %xmm8 7169; SSE-NEXT: pand %xmm14, %xmm6 7170; SSE-NEXT: por %xmm6, %xmm8 7171; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7172; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7173; SSE-NEXT: 
movdqa %xmm12, %xmm1 7174; SSE-NEXT: psrlq $48, %xmm1 7175; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 7176; SSE-NEXT: # xmm6 = mem[2,2,3,3] 7177; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] 7178; SSE-NEXT: movdqa %xmm4, %xmm1 7179; SSE-NEXT: pandn %xmm6, %xmm1 7180; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 7181; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 7182; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] 7183; SSE-NEXT: pand %xmm4, %xmm3 7184; SSE-NEXT: por %xmm1, %xmm3 7185; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7186; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] 7187; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 7188; SSE-NEXT: movdqa %xmm14, %xmm6 7189; SSE-NEXT: pandn %xmm1, %xmm6 7190; SSE-NEXT: pand %xmm14, %xmm3 7191; SSE-NEXT: por %xmm3, %xmm6 7192; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7193; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7194; SSE-NEXT: movdqa %xmm8, %xmm1 7195; SSE-NEXT: psrlq $48, %xmm1 7196; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7197; SSE-NEXT: # xmm3 = mem[2,2,3,3] 7198; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 7199; SSE-NEXT: movdqa %xmm4, %xmm1 7200; SSE-NEXT: pandn %xmm3, %xmm1 7201; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] 7202; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 7203; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] 7204; SSE-NEXT: pand %xmm4, %xmm3 7205; SSE-NEXT: por %xmm1, %xmm3 7206; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7207; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] 7208; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 7209; SSE-NEXT: movdqa %xmm14, %xmm6 7210; SSE-NEXT: pandn %xmm1, %xmm6 7211; SSE-NEXT: pand %xmm14, %xmm3 7212; SSE-NEXT: por %xmm3, %xmm6 7213; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7214; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7215; SSE-NEXT: movdqa %xmm6, %xmm1 7216; SSE-NEXT: psrlq $48, %xmm1 7217; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7218; SSE-NEXT: # xmm3 = mem[2,2,3,3] 7219; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 7220; SSE-NEXT: movdqa %xmm4, %xmm1 7221; SSE-NEXT: pandn %xmm3, %xmm1 7222; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 7223; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 7224; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 7225; SSE-NEXT: pand %xmm4, %xmm2 7226; SSE-NEXT: por %xmm1, %xmm2 7227; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7228; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] 7229; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 7230; SSE-NEXT: movdqa %xmm14, %xmm3 7231; SSE-NEXT: pandn %xmm1, %xmm3 7232; SSE-NEXT: pand %xmm14, %xmm2 7233; SSE-NEXT: por %xmm2, %xmm3 7234; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7235; SSE-NEXT: psrlq $48, %xmm0 7236; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] 7237; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 7238; SSE-NEXT: movdqa %xmm4, %xmm1 7239; SSE-NEXT: pandn %xmm2, %xmm1 7240; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] 7241; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 7242; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 7243; SSE-NEXT: pand %xmm4, %xmm2 7244; SSE-NEXT: por %xmm1, %xmm2 7245; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload 7246; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] 7247; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 7248; SSE-NEXT: movdqa %xmm14, %xmm0 7249; SSE-NEXT: pandn %xmm1, %xmm0 7250; SSE-NEXT: pand %xmm14, %xmm2 7251; SSE-NEXT: por %xmm2, %xmm0 7252; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7253; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7254; SSE-NEXT: movdqa %xmm7, %xmm1 7255; SSE-NEXT: psrlq $48, %xmm1 7256; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 7257; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] 7258; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 7259; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] 7260; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 7261; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] 7262; SSE-NEXT: pand %xmm4, %xmm1 7263; SSE-NEXT: pandn %xmm2, %xmm4 7264; SSE-NEXT: por %xmm1, %xmm4 7265; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7266; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] 7267; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 7268; SSE-NEXT: movdqa %xmm14, %xmm0 7269; SSE-NEXT: pandn %xmm1, %xmm0 7270; SSE-NEXT: pand %xmm14, %xmm4 7271; SSE-NEXT: por %xmm4, %xmm0 7272; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7273; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7274; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7275; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7276; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 7277; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 7278; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 7279; SSE-NEXT: # xmm11 = mem[0,1,0,3] 7280; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6] 7281; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7282; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] 7283; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] 7284; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 7285; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7] 7286; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 7287; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7288; SSE-NEXT: movdqa %xmm14, %xmm4 7289; SSE-NEXT: pandn %xmm3, %xmm4 7290; SSE-NEXT: andps %xmm14, %xmm1 7291; SSE-NEXT: por %xmm1, %xmm4 7292; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7293; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7294; SSE-NEXT: # xmm1 = mem[1,1,1,1] 7295; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7296; SSE-NEXT: # xmm3 = mem[2,3,2,3] 7297; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 7298; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7299; SSE-NEXT: # xmm1 = mem[0,1,0,3] 7300; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] 7301; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] 7302; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] 7303; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7304; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] 7305; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 7306; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7307; SSE-NEXT: movdqa %xmm14, %xmm9 7308; SSE-NEXT: pandn %xmm3, %xmm9 7309; SSE-NEXT: andps %xmm14, %xmm4 7310; SSE-NEXT: por 
%xmm4, %xmm9 7311; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7312; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7313; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7314; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7315; SSE-NEXT: # xmm4 = mem[2,3,2,3] 7316; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7317; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7318; SSE-NEXT: # xmm3 = mem[0,1,0,3] 7319; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7320; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7321; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1] 7322; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 7323; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7324; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 7325; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 7326; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] 7327; SSE-NEXT: movdqa %xmm14, %xmm9 7328; SSE-NEXT: pandn %xmm4, %xmm9 7329; SSE-NEXT: andps %xmm14, %xmm3 7330; SSE-NEXT: por %xmm3, %xmm9 7331; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7332; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7333; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7334; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7335; SSE-NEXT: # xmm4 = mem[2,3,2,3] 7336; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7337; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7338; SSE-NEXT: # xmm3 = mem[0,1,0,3] 7339; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7340; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7341; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] 7342; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 7343; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7344; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 7345; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 7346; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] 7347; SSE-NEXT: movdqa %xmm14, %xmm9 7348; SSE-NEXT: pandn %xmm4, %xmm9 7349; SSE-NEXT: andps %xmm14, %xmm3 7350; SSE-NEXT: por %xmm3, %xmm9 7351; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7352; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7353; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7354; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7355; SSE-NEXT: # xmm4 = mem[2,3,2,3] 7356; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7357; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7358; SSE-NEXT: # xmm3 = mem[0,1,0,3] 7359; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7360; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7361; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] 7362; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 7363; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7364; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 7365; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 7366; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] 7367; SSE-NEXT: movdqa %xmm14, %xmm12 7368; SSE-NEXT: pandn %xmm4, %xmm12 7369; SSE-NEXT: andps %xmm14, %xmm3 7370; SSE-NEXT: por 
%xmm3, %xmm12 7371; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7372; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7373; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7374; SSE-NEXT: # xmm4 = mem[2,3,2,3] 7375; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7376; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7377; SSE-NEXT: # xmm3 = mem[0,1,0,3] 7378; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7379; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7380; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] 7381; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 7382; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7383; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 7384; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 7385; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] 7386; SSE-NEXT: movdqa %xmm14, %xmm15 7387; SSE-NEXT: pandn %xmm4, %xmm15 7388; SSE-NEXT: andps %xmm14, %xmm3 7389; SSE-NEXT: por %xmm3, %xmm15 7390; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7391; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7392; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7393; SSE-NEXT: # xmm4 = mem[2,3,2,3] 7394; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7395; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7396; SSE-NEXT: # xmm3 = mem[0,1,0,3] 7397; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7398; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7399; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7400; SSE-NEXT: # xmm3 = xmm3[1],mem[1] 7401; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 7402; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7403; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 7404; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 7405; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] 7406; SSE-NEXT: movdqa %xmm14, %xmm8 7407; SSE-NEXT: pandn %xmm4, %xmm8 7408; SSE-NEXT: andps %xmm14, %xmm3 7409; SSE-NEXT: por %xmm3, %xmm8 7410; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 7411; SSE-NEXT: # xmm3 = mem[1,1,1,1] 7412; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7413; SSE-NEXT: # xmm4 = mem[2,3,2,3] 7414; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 7415; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] 7416; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7417; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 7418; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] 7419; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] 7420; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 7421; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] 7422; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 7423; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] 7424; SSE-NEXT: movdqa %xmm14, %xmm7 7425; SSE-NEXT: pandn %xmm4, %xmm7 7426; SSE-NEXT: andps %xmm14, %xmm3 7427; SSE-NEXT: por %xmm3, %xmm7 7428; SSE-NEXT: psrlq $48, %xmm0 7429; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7430; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7431; SSE-NEXT: movdqa %xmm2, %xmm3 7432; SSE-NEXT: psrld $16, %xmm5 7433; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7] 7434; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] 7435; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] 7436; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] 7437; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 7438; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] 7439; SSE-NEXT: movdqa %xmm14, %xmm6 7440; SSE-NEXT: pandn %xmm3, %xmm6 7441; SSE-NEXT: andps %xmm14, %xmm2 7442; SSE-NEXT: por %xmm2, %xmm6 7443; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7444; SSE-NEXT: psrlq $48, %xmm0 7445; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7446; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7447; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7448; SSE-NEXT: movdqa %xmm2, %xmm3 7449; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7450; SSE-NEXT: psrld $16, %xmm2 7451; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 7452; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 7453; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] 7454; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7455; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] 7456; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 7457; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] 7458; SSE-NEXT: movdqa %xmm14, %xmm5 7459; SSE-NEXT: pandn %xmm2, %xmm5 7460; SSE-NEXT: andps %xmm14, %xmm1 7461; SSE-NEXT: por %xmm1, %xmm5 7462; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7463; SSE-NEXT: psrlq $48, %xmm1 7464; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7465; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7466; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 7467; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7468; SSE-NEXT: psrld $16, %xmm2 7469; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7470; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] 7471; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 7472; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 7473; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7474; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] 7475; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 7476; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] 7477; SSE-NEXT: movdqa %xmm14, %xmm9 7478; SSE-NEXT: pandn %xmm2, %xmm9 7479; SSE-NEXT: andps %xmm14, %xmm1 7480; SSE-NEXT: por %xmm1, %xmm9 7481; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7482; SSE-NEXT: psrlq $48, %xmm0 7483; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7484; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7485; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7486; SSE-NEXT: movdqa %xmm1, %xmm2 7487; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7488; SSE-NEXT: psrld $16, %xmm0 7489; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7490; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] 7491; 
SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 7492; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 7493; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7494; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] 7495; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 7496; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] 7497; SSE-NEXT: movdqa %xmm14, %xmm11 7498; SSE-NEXT: pandn %xmm2, %xmm11 7499; SSE-NEXT: andps %xmm14, %xmm1 7500; SSE-NEXT: por %xmm1, %xmm11 7501; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7502; SSE-NEXT: psrlq $48, %xmm0 7503; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7504; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7505; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7506; SSE-NEXT: movdqa %xmm1, %xmm2 7507; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7508; SSE-NEXT: psrld $16, %xmm3 7509; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7510; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] 7511; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] 7512; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 7513; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7514; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] 7515; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 7516; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] 7517; SSE-NEXT: movdqa %xmm14, %xmm10 7518; SSE-NEXT: pandn %xmm2, %xmm10 7519; SSE-NEXT: andps %xmm14, %xmm1 7520; SSE-NEXT: por %xmm1, %xmm10 7521; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7522; SSE-NEXT: psrlq $48, %xmm0 7523; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7524; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7525; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7526; SSE-NEXT: movdqa %xmm1, %xmm2 7527; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7528; SSE-NEXT: psrld $16, %xmm3 7529; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7530; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] 7531; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] 7532; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 7533; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7534; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] 7535; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 7536; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] 7537; SSE-NEXT: movdqa %xmm14, %xmm4 7538; SSE-NEXT: pandn %xmm2, %xmm4 7539; SSE-NEXT: andps %xmm14, %xmm1 7540; SSE-NEXT: por %xmm1, %xmm4 7541; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7542; SSE-NEXT: psrlq $48, %xmm1 7543; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7544; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7545; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 7546; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7547; SSE-NEXT: psrld $16, %xmm1 7548; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7549; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] 7550; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 7551; 
SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 7552; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7553; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] 7554; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 7555; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] 7556; SSE-NEXT: movdqa %xmm14, %xmm3 7557; SSE-NEXT: pandn %xmm1, %xmm3 7558; SSE-NEXT: andps %xmm14, %xmm2 7559; SSE-NEXT: por %xmm2, %xmm3 7560; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7561; SSE-NEXT: psrlq $48, %xmm0 7562; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7563; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7564; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7565; SSE-NEXT: movdqa %xmm1, %xmm2 7566; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7567; SSE-NEXT: psrld $16, %xmm0 7568; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7569; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] 7570; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 7571; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 7572; SSE-NEXT: andps %xmm14, %xmm1 7573; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7574; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] 7575; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 7576; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] 7577; SSE-NEXT: pandn %xmm2, %xmm14 7578; SSE-NEXT: por %xmm1, %xmm14 7579; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7580; SSE-NEXT: movaps %xmm0, 96(%rsi) 7581; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7582; SSE-NEXT: movaps %xmm0, 32(%rsi) 7583; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7584; SSE-NEXT: movaps %xmm1, 112(%rsi) 7585; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7586; SSE-NEXT: movaps %xmm1, 48(%rsi) 7587; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7588; SSE-NEXT: movaps %xmm1, 64(%rsi) 7589; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7590; SSE-NEXT: movaps %xmm1, (%rsi) 7591; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7592; SSE-NEXT: movaps %xmm1, 80(%rsi) 7593; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7594; SSE-NEXT: movaps %xmm1, 16(%rsi) 7595; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7596; SSE-NEXT: movaps %xmm0, 96(%rdx) 7597; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7598; SSE-NEXT: movaps %xmm0, 32(%rdx) 7599; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7600; SSE-NEXT: movaps %xmm0, 112(%rdx) 7601; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7602; SSE-NEXT: movaps %xmm0, 48(%rdx) 7603; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7604; SSE-NEXT: movaps %xmm0, 64(%rdx) 7605; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7606; SSE-NEXT: movaps %xmm0, (%rdx) 7607; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7608; SSE-NEXT: movaps %xmm0, 80(%rdx) 7609; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7610; SSE-NEXT: movaps %xmm0, 16(%rdx) 7611; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7612; SSE-NEXT: movaps %xmm0, 96(%rcx) 7613; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7614; SSE-NEXT: 
movaps %xmm0, 112(%rcx) 7615; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7616; SSE-NEXT: movaps %xmm0, 64(%rcx) 7617; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7618; SSE-NEXT: movaps %xmm0, 80(%rcx) 7619; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7620; SSE-NEXT: movaps %xmm0, 32(%rcx) 7621; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7622; SSE-NEXT: movaps %xmm0, 48(%rcx) 7623; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7624; SSE-NEXT: movaps %xmm0, (%rcx) 7625; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7626; SSE-NEXT: movaps %xmm0, 16(%rcx) 7627; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7628; SSE-NEXT: movaps %xmm0, 112(%r8) 7629; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7630; SSE-NEXT: movaps %xmm0, 96(%r8) 7631; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7632; SSE-NEXT: movaps %xmm0, 80(%r8) 7633; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7634; SSE-NEXT: movaps %xmm0, 64(%r8) 7635; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7636; SSE-NEXT: movaps %xmm0, 48(%r8) 7637; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7638; SSE-NEXT: movaps %xmm0, 32(%r8) 7639; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7640; SSE-NEXT: movaps %xmm0, 16(%r8) 7641; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 7642; SSE-NEXT: movaps %xmm0, (%r8) 7643; SSE-NEXT: movdqa %xmm7, 112(%r9) 7644; SSE-NEXT: movdqa %xmm8, 96(%r9) 7645; SSE-NEXT: movdqa %xmm15, 80(%r9) 7646; SSE-NEXT: movdqa %xmm12, 64(%r9) 7647; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7648; SSE-NEXT: movaps %xmm0, 48(%r9) 7649; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7650; SSE-NEXT: movaps %xmm0, 32(%r9) 7651; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7652; SSE-NEXT: movaps %xmm0, 16(%r9) 7653; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7654; SSE-NEXT: movaps %xmm0, (%r9) 7655; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 7656; SSE-NEXT: movdqa %xmm14, 112(%rax) 7657; SSE-NEXT: movdqa %xmm3, 96(%rax) 7658; SSE-NEXT: movdqa %xmm4, 80(%rax) 7659; SSE-NEXT: movdqa %xmm10, 64(%rax) 7660; SSE-NEXT: movdqa %xmm11, 48(%rax) 7661; SSE-NEXT: movdqa %xmm9, 32(%rax) 7662; SSE-NEXT: movdqa %xmm5, 16(%rax) 7663; SSE-NEXT: movdqa %xmm6, (%rax) 7664; SSE-NEXT: addq $1176, %rsp # imm = 0x498 7665; SSE-NEXT: retq 7666; 7667; AVX-LABEL: load_i16_stride6_vf64: 7668; AVX: # %bb.0: 7669; AVX-NEXT: subq $1368, %rsp # imm = 0x558 7670; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 7671; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7672; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 7673; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7674; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 7675; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 7676; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7677; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 7678; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 7679; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7680; AVX-NEXT: vpslld $16, %xmm1, %xmm1 7681; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 7682; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7683; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7684; AVX-NEXT: vpunpcklwd 
{{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7685; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 7686; AVX-NEXT: vmovdqa (%rdi), %xmm3 7687; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7688; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 7689; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7690; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 7691; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 7692; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7693; AVX-NEXT: vpsrlq $16, %xmm5, %xmm1 7694; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7695; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 7696; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7697; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] 7698; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 7699; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] 7700; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7701; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 7702; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] 7703; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] 7704; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] 7705; AVX-NEXT: vmovdqa 176(%rdi), %xmm0 7706; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7707; AVX-NEXT: vpslld $16, %xmm0, %xmm0 7708; AVX-NEXT: vmovdqa 160(%rdi), %xmm2 7709; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7710; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7711; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7712; AVX-NEXT: vmovdqa 128(%rdi), %xmm2 7713; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7714; AVX-NEXT: vpsrlq $16, %xmm2, %xmm2 7715; AVX-NEXT: vmovdqa 144(%rdi), %xmm3 7716; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7717; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 7718; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7719; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] 7720; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 7721; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] 7722; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 7723; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1 7724; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7725; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 7726; AVX-NEXT: vorps %ymm2, %ymm1, %ymm0 7727; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7728; AVX-NEXT: vmovdqa 464(%rdi), %xmm0 7729; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 7730; AVX-NEXT: vpslld $16, %xmm0, %xmm1 7731; AVX-NEXT: vmovdqa 448(%rdi), %xmm0 7732; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7733; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7734; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7735; AVX-NEXT: vmovdqa 480(%rdi), %xmm0 7736; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7737; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 7738; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7739; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 7740; AVX-NEXT: vmovdqa 496(%rdi), %xmm0 7741; AVX-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7742; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7743; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 7744; AVX-NEXT: vmovdqa 416(%rdi), %xmm0 7745; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7746; AVX-NEXT: vpsrlq $16, %xmm0, %xmm2 7747; AVX-NEXT: vmovdqa 432(%rdi), %xmm0 7748; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7749; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3] 7750; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7] 7751; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 7752; AVX-NEXT: vmovdqa 384(%rdi), %xmm0 7753; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7754; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,3] 7755; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] 7756; AVX-NEXT: vmovdqa 400(%rdi), %xmm0 7757; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7758; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 7759; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] 7760; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 7761; AVX-NEXT: vmovdqa 560(%rdi), %xmm0 7762; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7763; AVX-NEXT: vpslld $16, %xmm0, %xmm2 7764; AVX-NEXT: vmovdqa 544(%rdi), %xmm0 7765; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7766; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7767; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 7768; AVX-NEXT: vmovdqa 512(%rdi), %xmm0 7769; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7770; AVX-NEXT: vpsrlq $16, %xmm0, %xmm3 7771; AVX-NEXT: vmovdqa 528(%rdi), %xmm0 7772; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7773; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,3,2,3] 7774; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7] 7775; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 7776; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] 7777; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1 7778; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7779; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 7780; AVX-NEXT: vorps %ymm2, %ymm1, %ymm0 7781; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7782; AVX-NEXT: vmovdqa 272(%rdi), %xmm0 7783; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7784; AVX-NEXT: vpslld $16, %xmm0, %xmm1 7785; AVX-NEXT: vmovdqa 256(%rdi), %xmm0 7786; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7787; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7788; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7789; AVX-NEXT: vmovdqa 288(%rdi), %xmm0 7790; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7791; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 7792; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7793; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 7794; AVX-NEXT: vmovdqa 304(%rdi), %xmm0 7795; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7796; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7797; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 7798; AVX-NEXT: vmovdqa 224(%rdi), %xmm0 7799; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill 7800; AVX-NEXT: vpsrlq $16, %xmm0, %xmm2 7801; AVX-NEXT: vmovdqa 240(%rdi), %xmm0 7802; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7803; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,3] 7804; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] 7805; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 7806; AVX-NEXT: vmovdqa 192(%rdi), %xmm0 7807; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7808; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] 7809; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] 7810; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 7811; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7812; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 7813; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] 7814; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 7815; AVX-NEXT: vmovdqa 368(%rdi), %xmm0 7816; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7817; AVX-NEXT: vpslld $16, %xmm0, %xmm2 7818; AVX-NEXT: vmovdqa 352(%rdi), %xmm0 7819; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7820; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7821; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 7822; AVX-NEXT: vmovdqa 320(%rdi), %xmm10 7823; AVX-NEXT: vpsrlq $16, %xmm10, %xmm3 7824; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7825; AVX-NEXT: vmovdqa 336(%rdi), %xmm0 7826; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7827; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] 7828; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] 7829; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 7830; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] 7831; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1 7832; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7833; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 7834; AVX-NEXT: vorps %ymm2, %ymm1, %ymm0 7835; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7836; AVX-NEXT: vmovdqa 656(%rdi), %xmm0 7837; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7838; AVX-NEXT: vpslld $16, %xmm0, %xmm1 7839; AVX-NEXT: vmovdqa 640(%rdi), %xmm0 7840; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7841; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7842; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7843; AVX-NEXT: vmovdqa 672(%rdi), %xmm0 7844; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7845; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 7846; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7847; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] 7848; AVX-NEXT: vmovdqa 688(%rdi), %xmm0 7849; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7850; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 7851; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 7852; AVX-NEXT: vmovdqa 608(%rdi), %xmm0 7853; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7854; AVX-NEXT: vpsrlq $16, %xmm0, %xmm2 7855; AVX-NEXT: vmovdqa 624(%rdi), %xmm0 7856; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7857; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3] 7858; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7] 
7859; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] 7860; AVX-NEXT: vmovdqa 576(%rdi), %xmm0 7861; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7862; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] 7863; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] 7864; AVX-NEXT: vmovdqa 592(%rdi), %xmm2 7865; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7866; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7867; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4,5],xmm0[6,7] 7868; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4,5,6,7] 7869; AVX-NEXT: vmovdqa 752(%rdi), %xmm0 7870; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7871; AVX-NEXT: vpslld $16, %xmm0, %xmm1 7872; AVX-NEXT: vmovdqa 736(%rdi), %xmm0 7873; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7874; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7875; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] 7876; AVX-NEXT: vmovdqa 704(%rdi), %xmm0 7877; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7878; AVX-NEXT: vpsrlq $16, %xmm0, %xmm0 7879; AVX-NEXT: vmovdqa 720(%rdi), %xmm1 7880; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7881; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 7882; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7] 7883; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 7884; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7] 7885; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2 7886; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 7887; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0 7888; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 7889; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7890; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7891; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 7892; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7893; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 7894; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7895; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7896; AVX-NEXT: # xmm2 = mem[2,2,3,3] 7897; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 7898; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 7899; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 7900; AVX-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7901; AVX-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] 7902; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] 7903; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] 7904; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 7905; AVX-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] 7906; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7907; AVX-NEXT: vpsrld $16, %xmm5, %xmm15 7908; AVX-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] 7909; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7] 7910; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 7911; AVX-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7912; AVX-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] 7913; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 7914; AVX-NEXT: # xmm14 = 
mem[1,1,1,1] 7915; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] 7916; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 7917; AVX-NEXT: # xmm14 = mem[2,2,3,3] 7918; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload 7919; AVX-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] 7920; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7] 7921; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 7922; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7923; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 7924; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 7925; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7926; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7927; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 7928; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7929; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 7930; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7931; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7932; AVX-NEXT: # xmm2 = mem[2,2,3,3] 7933; AVX-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload 7934; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 7935; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 7936; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] 7937; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 7938; AVX-NEXT: # xmm13 = mem[1,1,1,1] 7939; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] 7940; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] 7941; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7942; AVX-NEXT: vpsrld $16, %xmm5, %xmm13 7943; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] 7944; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7] 7945; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 7946; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] 7947; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 7948; AVX-NEXT: # xmm11 = mem[1,1,1,1] 7949; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] 7950; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 7951; AVX-NEXT: # xmm11 = mem[2,2,3,3] 7952; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload 7953; AVX-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] 7954; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7] 7955; AVX-NEXT: vmovaps %ymm6, %ymm13 7956; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 7957; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7958; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 7959; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 7960; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7961; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7962; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 7963; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7964; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 7965; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7966; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7967; AVX-NEXT: # xmm2 = mem[2,2,3,3] 7968; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 7969; AVX-NEXT: # xmm2 = 
xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 7970; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 7971; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7] 7972; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 7973; AVX-NEXT: # xmm9 = mem[1,1,1,1] 7974; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] 7975; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 7976; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7977; AVX-NEXT: vpsrld $16, %xmm5, %xmm9 7978; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] 7979; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7] 7980; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 7981; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] 7982; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] 7983; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] 7984; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 7985; AVX-NEXT: # xmm7 = mem[2,2,3,3] 7986; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 7987; AVX-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] 7988; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7] 7989; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 7990; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 7991; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 7992; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 7993; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7994; AVX-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7995; AVX-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] 7996; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7997; AVX-NEXT: vpsrld $16, %xmm12, %xmm2 7998; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 7999; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 8000; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] 8001; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 8002; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] 8003; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 8004; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] 8005; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8006; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] 8007; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 8008; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] 8009; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 8010; AVX-NEXT: vpsrld $16, %xmm5, %xmm4 8011; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 8012; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] 8013; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 8014; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 8015; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8016; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] 8017; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 8018; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8019; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] 8020; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8021; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 8022; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,5],xmm2[6,7] 8023; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 8024; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 8025; AVX-NEXT: vandnps %ymm1, %ymm13, %ymm1 8026; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 8027; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8028; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8029; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8030; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8031; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8032; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 8033; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] 8034; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8035; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 8036; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] 8037; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8038; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1 8039; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 8040; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8041; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 8042; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] 8043; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8044; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1 8045; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] 8046; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8047; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 8048; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] 8049; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8050; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2 8051; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 8052; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 8053; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 8054; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2 8055; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 8056; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8057; AVX-NEXT: # xmm2 = mem[1,1,1,1] 8058; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8059; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8060; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 8061; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8062; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 8063; AVX-NEXT: # xmm3 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] 8064; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8065; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm3 8066; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 8067; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 8068; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 8069; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 8070; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 8071; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8072; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8073; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8074; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8075; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8076; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 
8077; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8078; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 8079; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] 8080; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8081; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2 8082; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 8083; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8084; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload 8085; AVX-NEXT: # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] 8086; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8087; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm2 8088; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8089; AVX-NEXT: vpblendw $48, (%rsp), %xmm3, %xmm0 # 16-byte Folded Reload 8090; AVX-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] 8091; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8092; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm3 8093; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 8094; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm0 8095; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2 8096; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 8097; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8098; AVX-NEXT: # xmm2 = mem[1,1,1,1] 8099; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8100; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8101; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 8102; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8103; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload 8104; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7] 8105; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8106; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm3 8107; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 8108; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 8109; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 8110; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 8111; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 8112; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8113; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8114; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8115; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8116; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 8117; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload 8118; AVX-NEXT: # xmm2 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] 8119; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8120; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2 8121; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 8122; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload 8123; AVX-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] 8124; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8125; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2 8126; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] 8127; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8128; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm3 8129; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 8130; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 8131; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2 8132; AVX-NEXT: vorps %ymm0, %ymm2, %ymm2 8133; 
AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8134; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8135; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8136; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] 8137; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] 8138; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8139; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm4 8140; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] 8141; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 8142; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 8143; AVX-NEXT: vandnps %ymm3, %ymm13, %ymm3 8144; AVX-NEXT: vorps %ymm3, %ymm2, %ymm0 8145; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8146; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8147; AVX-NEXT: # xmm2 = mem[1,1,1,1] 8148; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8149; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8150; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 8151; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8152; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 8153; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] 8154; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8155; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm3 8156; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 8157; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8158; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 8159; AVX-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] 8160; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm0 8161; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8162; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload 8163; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7] 8164; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm6 8165; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 8166; AVX-NEXT: vandnps %ymm2, %ymm9, %ymm2 8167; AVX-NEXT: vandps %ymm0, %ymm9, %ymm0 8168; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 8169; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 8170; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] 8171; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 8172; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 8173; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] 8174; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8175; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 8176; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] 8177; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm14 8178; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] 8179; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 8180; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 8181; AVX-NEXT: vandnps %ymm6, %ymm13, %ymm6 8182; AVX-NEXT: vmovaps %ymm13, %ymm5 8183; AVX-NEXT: vorps %ymm6, %ymm0, %ymm0 8184; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8185; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8186; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 8187; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 8188; AVX-NEXT: # xmm6 = mem[2,2,3,3] 8189; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm6[0],xmm0[0] 8190; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] 8191; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8192; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm6 8193; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] 8194; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8195; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm7 8196; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] 8197; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8198; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 8199; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 8200; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 8201; AVX-NEXT: vandps %ymm7, %ymm9, %ymm7 8202; AVX-NEXT: vorps %ymm0, %ymm7, %ymm0 8203; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8204; AVX-NEXT: vpsrlq $48, %xmm7, %xmm7 8205; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 8206; AVX-NEXT: # xmm8 = mem[2,2,3,3] 8207; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] 8208; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8209; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 8210; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] 8211; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 8212; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 8213; AVX-NEXT: vandnps %ymm7, %ymm13, %ymm7 8214; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0 8215; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8216; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 8217; AVX-NEXT: vpsrlq $48, %xmm13, %xmm0 8218; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8219; AVX-NEXT: # xmm7 = mem[2,2,3,3] 8220; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] 8221; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8222; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm7 8223; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] 8224; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8225; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm7 8226; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8227; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm8 8228; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 8229; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 8230; AVX-NEXT: vandps %ymm7, %ymm9, %ymm7 8231; AVX-NEXT: vorps %ymm0, %ymm7, %ymm0 8232; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 8233; AVX-NEXT: vpsrlq $48, %xmm10, %xmm7 8234; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 8235; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] 8236; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] 8237; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8238; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm8 8239; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] 8240; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 8241; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 8242; AVX-NEXT: vandnps %ymm7, %ymm5, %ymm7 8243; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0 8244; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8245; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8246; AVX-NEXT: vpsrlq $48, %xmm8, %xmm0 8247; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] 8248; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] 8249; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8250; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm4 8251; 
AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7] 8252; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm3 8253; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm4 8254; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 8255; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 8256; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3 8257; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 8258; AVX-NEXT: vpsrlq $48, %xmm11, %xmm3 8259; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8260; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] 8261; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 8262; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 8263; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] 8264; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 8265; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 8266; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 8267; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 8268; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8269; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8270; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 8271; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8272; AVX-NEXT: # xmm2 = mem[2,2,3,3] 8273; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 8274; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8275; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2 8276; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 8277; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8278; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2 8279; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8280; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3 8281; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 8282; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 8283; AVX-NEXT: vandps %ymm2, %ymm9, %ymm1 8284; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 8285; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8286; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 8287; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8288; AVX-NEXT: # xmm2 = mem[2,2,3,3] 8289; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 8290; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8291; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 8292; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 8293; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 8294; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 8295; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1 8296; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 8297; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8298; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8299; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8300; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8301; AVX-NEXT: # xmm1 = mem[2,3,2,3] 8302; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 8303; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8304; AVX-NEXT: vpblendw $243, (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload 8305; AVX-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7] 8306; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 8307; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] 8308; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 8309; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 8310; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8311; AVX-NEXT: # xmm2 = mem[1,1,1,1] 8312; AVX-NEXT: vpshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8313; AVX-NEXT: # xmm3 = mem[2,3,2,3] 8314; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 8315; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8316; AVX-NEXT: # xmm3 = mem[0,1,0,3] 8317; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8318; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] 8319; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm13[1] 8320; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] 8321; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 8322; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 8323; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 8324; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 8325; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8326; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload 8327; AVX-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] 8328; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8329; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] 8330; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8331; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] 8332; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] 8333; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm3 8334; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] 8335; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 8336; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] 8337; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8338; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8339; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8340; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8341; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] 8342; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 8343; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8344; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload 8345; AVX-NEXT: # xmm13 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] 8346; AVX-NEXT: vpshufb %xmm1, %xmm13, %xmm2 8347; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 8348; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 8349; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] 8350; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 8351; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] 8352; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 8353; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8354; AVX-NEXT: # xmm3 = mem[0,1,0,3] 8355; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,4,6] 8356; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload 8357; AVX-NEXT: # xmm6 = xmm6[1],mem[1] 8358; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] 8359; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 8360; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 8361; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 8362; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8363; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload 8364; AVX-NEXT: # xmm9 = mem[0,1],xmm2[2,3],mem[4,5,6,7] 8365; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8366; 
AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 8367; AVX-NEXT: # xmm6 = mem[0,1,0,3] 8368; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] 8369; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8370; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1] 8371; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm10 8372; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] 8373; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 8374; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] 8375; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8376; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8377; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8378; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8379; AVX-NEXT: # xmm7 = mem[2,3,2,3] 8380; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] 8381; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8382; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 8383; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] 8384; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8385; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm7 8386; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 8387; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8388; AVX-NEXT: # xmm7 = mem[1,1,1,1] 8389; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 8390; AVX-NEXT: # xmm10 = mem[2,3,2,3] 8391; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] 8392; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8393; AVX-NEXT: # xmm9 = mem[0,1,0,3] 8394; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8395; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6] 8396; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm8[1] 8397; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] 8398; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 8399; AVX-NEXT: vandps %ymm5, %ymm7, %ymm7 8400; AVX-NEXT: vorps %ymm0, %ymm7, %ymm0 8401; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8402; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload 8403; AVX-NEXT: # xmm8 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] 8404; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8405; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3] 8406; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8407; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] 8408; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 8409; AVX-NEXT: # xmm7 = xmm7[1],mem[1] 8410; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm14 8411; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] 8412; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 8413; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] 8414; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8415; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8416; AVX-NEXT: # xmm0 = mem[1,1,1,1] 8417; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 8418; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] 8419; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] 8420; AVX-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8421; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 8422; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] 8423; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8424; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm7 8425; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 8426; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8427; AVX-NEXT: # xmm7 = mem[1,1,1,1] 8428; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 8429; AVX-NEXT: # xmm15 = mem[2,3,2,3] 8430; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] 8431; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 8432; AVX-NEXT: # xmm15 = mem[0,1,0,3] 8433; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,4,6] 8434; AVX-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload 8435; AVX-NEXT: # xmm9 = xmm9[1],mem[1] 8436; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7] 8437; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 8438; AVX-NEXT: vandps %ymm5, %ymm7, %ymm7 8439; AVX-NEXT: vorps %ymm0, %ymm7, %ymm0 8440; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8441; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload 8442; AVX-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] 8443; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm9 8444; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 8445; AVX-NEXT: # xmm14 = mem[0,1,0,3] 8446; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,4,6] 8447; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8448; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1] 8449; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] 8450; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 8451; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] 8452; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8453; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8454; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 8455; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8456; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] 8457; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] 8458; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm4 8459; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 8460; AVX-NEXT: vpsrlq $48, %xmm12, %xmm8 8461; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8462; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 8463; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 8464; AVX-NEXT: vpsrld $16, %xmm9, %xmm9 8465; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] 8466; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] 8467; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] 8468; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 8469; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 8470; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 8471; AVX-NEXT: vpsrld $16, %xmm2, %xmm4 8472; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] 8473; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] 8474; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 
8475; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 8476; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] 8477; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 8478; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5,6,7] 8479; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8480; AVX-NEXT: vpsrlq $48, %xmm2, %xmm2 8481; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8482; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8483; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 8484; AVX-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload 8485; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 8486; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 8487; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8488; AVX-NEXT: vpsrlq $48, %xmm4, %xmm4 8489; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8490; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8491; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 8492; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8493; AVX-NEXT: vpsrld $16, %xmm6, %xmm6 8494; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 8495; AVX-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7] 8496; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] 8497; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3,4,5,6,7] 8498; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 8499; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4 8500; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2 8501; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8502; AVX-NEXT: vpsrld $16, %xmm4, %xmm4 8503; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 8504; AVX-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] 8505; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] 8506; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8507; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 8508; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] 8509; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 8510; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] 8511; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8512; AVX-NEXT: vpsrlq $48, %xmm4, %xmm4 8513; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8514; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8515; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 8516; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8517; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 8518; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 8519; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8520; AVX-NEXT: vpsrlq $48, %xmm6, %xmm6 8521; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8522; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8523; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 8524; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8525; AVX-NEXT: vpsrld $16, %xmm8, %xmm8 8526; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8527; AVX-NEXT: # xmm9 = mem[0,1,2,3,4,5,5,7] 
8528; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] 8529; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5,6,7] 8530; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 8531; AVX-NEXT: vandps %ymm5, %ymm6, %ymm6 8532; AVX-NEXT: vorps %ymm4, %ymm6, %ymm4 8533; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8534; AVX-NEXT: vpsrld $16, %xmm6, %xmm6 8535; AVX-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 8536; AVX-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7] 8537; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] 8538; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8539; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm8 8540; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7] 8541; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 8542; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] 8543; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8544; AVX-NEXT: vpsrlq $48, %xmm6, %xmm6 8545; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8546; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 8547; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8548; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm8 8549; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 8550; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8551; AVX-NEXT: vpsrlq $48, %xmm8, %xmm8 8552; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 8553; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8554; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 8555; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 8556; AVX-NEXT: vpsrld $16, %xmm9, %xmm9 8557; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,5,5,7] 8558; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] 8559; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] 8560; AVX-NEXT: vandnps %ymm6, %ymm5, %ymm6 8561; AVX-NEXT: vandps %ymm5, %ymm8, %ymm5 8562; AVX-NEXT: vorps %ymm6, %ymm5, %ymm5 8563; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0 8564; AVX-NEXT: vpsrld $16, %xmm1, %xmm6 8565; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] 8566; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] 8567; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] 8568; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 8569; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] 8570; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8571; AVX-NEXT: vmovaps %ymm1, 96(%rsi) 8572; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8573; AVX-NEXT: vmovaps %ymm1, 32(%rsi) 8574; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8575; AVX-NEXT: vmovaps %ymm1, 64(%rsi) 8576; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8577; AVX-NEXT: vmovaps %ymm1, (%rsi) 8578; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8579; AVX-NEXT: vmovaps %ymm1, 96(%rdx) 8580; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8581; AVX-NEXT: vmovaps %ymm1, 32(%rdx) 8582; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8583; AVX-NEXT: vmovaps %ymm1, 64(%rdx) 8584; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8585; AVX-NEXT: vmovaps %ymm1, (%rdx) 8586; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8587; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 8588; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8589; AVX-NEXT: vmovaps %ymm1, 96(%rcx) 8590; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8591; AVX-NEXT: vmovaps %ymm1, 64(%rcx) 8592; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8593; AVX-NEXT: vmovaps %ymm1, (%rcx) 8594; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8595; AVX-NEXT: vmovaps %ymm1, 96(%r8) 8596; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8597; AVX-NEXT: vmovaps %ymm1, 32(%r8) 8598; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8599; AVX-NEXT: vmovaps %ymm1, 64(%r8) 8600; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8601; AVX-NEXT: vmovaps %ymm1, (%r8) 8602; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8603; AVX-NEXT: vmovaps %ymm1, 96(%r9) 8604; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8605; AVX-NEXT: vmovaps %ymm1, 32(%r9) 8606; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8607; AVX-NEXT: vmovaps %ymm1, (%r9) 8608; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8609; AVX-NEXT: vmovaps %ymm1, 64(%r9) 8610; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 8611; AVX-NEXT: vmovaps %ymm0, 96(%rax) 8612; AVX-NEXT: vmovaps %ymm4, 32(%rax) 8613; AVX-NEXT: vmovaps %ymm2, 64(%rax) 8614; AVX-NEXT: vmovaps %ymm3, (%rax) 8615; AVX-NEXT: addq $1368, %rsp # imm = 0x558 8616; AVX-NEXT: vzeroupper 8617; AVX-NEXT: retq 8618; 8619; AVX2-LABEL: load_i16_stride6_vf64: 8620; AVX2: # %bb.0: 8621; AVX2-NEXT: subq $1272, %rsp # imm = 0x4F8 8622; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 8623; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 8624; AVX2-NEXT: vmovaps 672(%rdi), %ymm2 8625; AVX2-NEXT: vmovaps 640(%rdi), %ymm3 8626; AVX2-NEXT: vmovdqa 288(%rdi), %ymm4 8627; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5 8628; AVX2-NEXT: vmovdqa 416(%rdi), %ymm8 8629; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8630; AVX2-NEXT: vmovdqa 384(%rdi), %ymm9 8631; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8632; AVX2-NEXT: vmovdqa 480(%rdi), %ymm6 8633; AVX2-NEXT: vmovdqa 448(%rdi), %ymm7 8634; AVX2-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] 8635; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8636; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] 8637; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8638; AVX2-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] 8639; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8640; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] 8641; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] 8642; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8643; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] 8644; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8645; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] 8646; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8647; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] 8648; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8649; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 8650; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] 8651; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 8652; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm0 8653; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 8654; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] 8655; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] 8656; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] 8657; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm7 8658; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 8659; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 8660; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8661; AVX2-NEXT: vmovdqa (%rdi), %ymm3 8662; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8663; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 8664; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8665; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] 8666; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm3 8667; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm11 8668; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,2,2,2,4,5,6,7] 8669; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7] 8670; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 8671; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm7 8672; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 8673; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8674; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 8675; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8676; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 8677; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8678; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] 8679; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 8680; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm8 8681; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] 8682; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7] 8683; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8684; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] 8685; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm13 8686; AVX2-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 8687; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8688; AVX2-NEXT: vmovdqa 608(%rdi), %ymm13 8689; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8690; AVX2-NEXT: vmovdqa 576(%rdi), %ymm12 8691; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8692; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] 8693; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm13 8694; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm6 8695; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7] 8696; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] 8697; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 8698; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 8699; AVX2-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] 8700; AVX2-NEXT: vpshufb %ymm2, %ymm13, %ymm2 8701; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 8702; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8703; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 8704; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm9 8705; AVX2-NEXT: 
vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] 8706; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] 8707; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] 8708; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 8709; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm10 8710; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 8711; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8712; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 8713; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] 8714; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 8715; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] 8716; AVX2-NEXT: vpshufb %ymm9, %ymm4, %ymm4 8717; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 8718; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8719; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm1 8720; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] 8721; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 8722; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] 8723; AVX2-NEXT: vpshufb %ymm9, %ymm7, %ymm3 8724; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 8725; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8726; AVX2-NEXT: vpshufb %ymm9, %ymm13, %ymm1 8727; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm2 8728; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] 8729; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 8730; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 8731; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 8732; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8733; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8734; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 8735; AVX2-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 8736; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm0 8737; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8738; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] 8739; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 8740; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 8741; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm1 8742; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 8743; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 8744; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8745; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 8746; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] 8747; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8748; AVX2-NEXT: vpshufb %ymm13, %ymm1, %ymm1 8749; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 8750; AVX2-NEXT: vmovdqa 544(%rdi), %ymm1 8751; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8752; AVX2-NEXT: vmovdqa 512(%rdi), %ymm2 8753; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8754; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 8755; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] 8756; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm2 8757; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8758; AVX2-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 8759; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] 8760; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 8761; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8762; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] 8763; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 8764; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8765; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8766; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 8767; AVX2-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 8768; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0 8769; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8770; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] 8771; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 8772; AVX2-NEXT: vpshufb %xmm10, %xmm6, %xmm2 8773; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 8774; AVX2-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload 8775; AVX2-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] 8776; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8777; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm2 8778; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 8779; AVX2-NEXT: vmovdqa 352(%rdi), %ymm2 8780; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8781; AVX2-NEXT: vmovdqa 320(%rdi), %ymm3 8782; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8783; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 8784; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] 8785; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm3 8786; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8787; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 8788; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 8789; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8790; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] 8791; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 8792; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8793; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8794; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 8795; AVX2-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 8796; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm14 8797; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] 8798; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 8799; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm2 8800; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 8801; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8802; AVX2-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 8803; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] 8804; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8805; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm2 8806; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 8807; AVX2-NEXT: vmovdqa 736(%rdi), %ymm8 8808; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8809; AVX2-NEXT: vmovdqa 704(%rdi), %ymm2 8810; AVX2-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8811; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7] 8812; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,2,2,2,4,5,6,7] 8813; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm12 8814; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3,4],xmm12[5,6,7] 8815; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm8 8816; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8817; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] 8818; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] 8819; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8820; AVX2-NEXT: vmovdqa 160(%rdi), %ymm8 8821; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8822; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 8823; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8824; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] 8825; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7] 8826; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 8827; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4],xmm11[5,6,7] 8828; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm9 8829; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8830; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 8831; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] 8832; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm8 8833; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm10 8834; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,2,0,3] 8835; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] 8836; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm15[2],xmm8[3],xmm15[4,5],xmm8[6,7] 8837; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 8838; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 8839; AVX2-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7] 8840; AVX2-NEXT: vpshufb %ymm13, %ymm8, %ymm13 8841; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] 8842; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 8843; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15] 8844; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] 8845; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8846; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 8847; AVX2-NEXT: # xmm9 = mem[2,1,0,3] 8848; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] 8849; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] 8850; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 8851; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm9[2],xmm5[3,4],xmm9[5],xmm5[6],xmm9[7] 8852; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 8853; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 8854; AVX2-NEXT: vpshufb %ymm9, %ymm13, %ymm13 8855; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] 8856; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] 8857; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7] 8858; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 8859; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8860; AVX2-NEXT: vpshufb %xmm5, %xmm15, %xmm15 8861; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 8862; 
AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3],xmm15[4,5],xmm7[6],xmm15[7] 8863; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 8864; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15] 8865; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] 8866; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8867; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8868; AVX2-NEXT: # xmm7 = mem[2,1,0,3] 8869; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] 8870; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] 8871; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] 8872; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6],xmm7[7] 8873; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 8874; AVX2-NEXT: vpshufb %ymm9, %ymm7, %ymm7 8875; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] 8876; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] 8877; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] 8878; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8879; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7 8880; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 8881; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] 8882; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 8883; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] 8884; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] 8885; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8886; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[2,1,0,3] 8887; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 8888; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 8889; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 8890; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6],xmm4[7] 8891; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8892; AVX2-NEXT: vpshufb %ymm9, %ymm4, %ymm4 8893; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7] 8894; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] 8895; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] 8896; AVX2-NEXT: vpshufb %xmm5, %xmm12, %xmm4 8897; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 8898; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] 8899; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8900; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] 8901; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 8902; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8903; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm2 8904; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 8905; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] 8906; AVX2-NEXT: vpshufb %ymm9, %ymm8, %ymm2 8907; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,1,0,3] 8908; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] 8909; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 8910; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 8911; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6],xmm3[7] 8912; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7] 8913; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] 8914; AVX2-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 8915; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8916; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 8917; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 8918; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8919; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8920; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 8921; AVX2-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 8922; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8923; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8924; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8925; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 8926; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 8927; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 8928; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8929; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 8930; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8931; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 8932; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 8933; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] 8934; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 8935; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8936; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 8937; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 8938; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] 8939; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 8940; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] 8941; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7] 8942; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 8943; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] 8944; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 8945; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8946; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 8947; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm2 8948; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 8949; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 8950; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 8951; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 8952; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8953; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8954; AVX2-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 8955; AVX2-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] 8956; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8957; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8958; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8959; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 8960; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 8961; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 8962; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8963; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 8964; AVX2-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8965; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 8966; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 8967; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] 8968; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 8969; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8970; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 8971; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 8972; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] 8973; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8974; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 8975; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 8976; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8977; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 8978; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 8979; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] 8980; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 8981; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8982; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm2 8983; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 8984; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 8985; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 8986; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 8987; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8988; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8989; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload 8990; AVX2-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 8991; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8992; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8993; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 8994; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 8995; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1] 8996; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] 8997; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] 8998; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 8999; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4] 9000; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 9001; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9002; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9003; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 9004; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] 9005; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 9006; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] 9007; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7] 9008; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] 9009; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] 9010; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] 9011; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 9012; AVX2-NEXT: vpshufb %ymm8, %ymm9, %ymm13 9013; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15] 9014; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] 9015; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] 9016; 
AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] 9017; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9018; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9019; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 9020; AVX2-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 9021; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 9022; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] 9023; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3] 9024; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7] 9025; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] 9026; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4] 9027; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7] 9028; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 9029; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload 9030; AVX2-NEXT: # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] 9031; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 9032; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 9033; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] 9034; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3] 9035; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 9036; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] 9037; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7] 9038; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] 9039; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7] 9040; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] 9041; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9042; AVX2-NEXT: vpshufb %ymm8, %ymm15, %ymm8 9043; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] 9044; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] 9045; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] 9046; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] 9047; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9048; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9049; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] 9050; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 9051; AVX2-NEXT: # xmm8 = mem[1,1,1,1,4,5,6,7] 9052; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] 9053; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7] 9054; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 9055; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] 9056; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 9057; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] 9058; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 9059; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 9060; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm8 9061; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9062; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] 9063; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] 9064; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7] 9065; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] 9066; AVX2-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9067; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,5,6,5] 9068; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] 9069; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 9070; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7] 9071; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] 9072; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] 9073; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 9074; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 9075; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm2 9076; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9077; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 9078; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 9079; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 9080; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9081; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9082; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] 9083; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9084; AVX2-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] 9085; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] 9086; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] 9087; AVX2-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9088; AVX2-NEXT: # xmm1 = mem[3,1,2,1,4,5,6,7] 9089; AVX2-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9090; AVX2-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7] 9091; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 9092; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] 9093; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9094; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 9095; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9096; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 9097; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 9098; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 9099; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9100; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5] 9101; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7] 9102; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] 9103; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] 9104; AVX2-NEXT: vpshufb %ymm7, %ymm15, %ymm1 9105; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,1,4,5,6,7] 9106; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7] 9107; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] 9108; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] 9109; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9110; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 9111; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] 9112; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] 9113; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9114; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9115; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9116; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9117; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 9118; AVX2-NEXT: vpshufd 
{{.*#+}} xmm7 = xmm0[0,3,2,1] 9119; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] 9120; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 9121; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] 9122; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm3 9123; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] 9124; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9125; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 9126; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] 9127; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9128; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9129; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9130; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 9131; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1] 9132; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] 9133; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 9134; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm4 9135; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] 9136; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9137; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 9138; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7] 9139; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9140; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9141; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9142; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm13 9143; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1] 9144; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7] 9145; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 9146; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm10 9147; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7] 9148; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9149; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 9150; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7] 9151; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9152; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 9153; AVX2-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9154; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm15 9155; AVX2-NEXT: vpshufb %xmm9, %xmm15, %xmm9 9156; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] 9157; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7] 9158; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] 9159; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7] 9160; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 9161; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload 9162; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7] 9163; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] 9164; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm11 9165; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] 9166; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] 9167; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] 9168; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 9169; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 9170; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] 9171; 
AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 9172; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] 9173; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] 9174; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7] 9175; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9176; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 9177; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] 9178; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm7 9179; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7] 9180; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] 9181; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] 9182; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 9183; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload 9184; AVX2-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] 9185; AVX2-NEXT: vpshufb %xmm9, %xmm15, %xmm9 9186; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] 9187; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 9188; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7] 9189; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9190; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9191; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] 9192; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9193; AVX2-NEXT: vmovaps %ymm9, 96(%rsi) 9194; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9195; AVX2-NEXT: vmovaps %ymm9, 32(%rsi) 9196; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9197; AVX2-NEXT: vmovaps %ymm9, 64(%rsi) 9198; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9199; AVX2-NEXT: vmovaps %ymm9, (%rsi) 9200; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9201; AVX2-NEXT: vmovaps %ymm9, 96(%rdx) 9202; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9203; AVX2-NEXT: vmovaps %ymm9, 32(%rdx) 9204; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9205; AVX2-NEXT: vmovaps %ymm9, 64(%rdx) 9206; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9207; AVX2-NEXT: vmovaps %ymm9, (%rdx) 9208; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9209; AVX2-NEXT: vmovaps %ymm9, 32(%rcx) 9210; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9211; AVX2-NEXT: vmovaps %ymm9, 96(%rcx) 9212; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9213; AVX2-NEXT: vmovaps %ymm9, 64(%rcx) 9214; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 9215; AVX2-NEXT: vmovaps %ymm9, (%rcx) 9216; AVX2-NEXT: vmovdqa %ymm6, 96(%r8) 9217; AVX2-NEXT: vmovdqa %ymm8, 32(%r8) 9218; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 9219; AVX2-NEXT: vmovaps %ymm6, 64(%r8) 9220; AVX2-NEXT: vmovdqa %ymm5, (%r8) 9221; AVX2-NEXT: vmovdqa %ymm10, 96(%r9) 9222; AVX2-NEXT: vmovdqa %ymm2, 32(%r9) 9223; AVX2-NEXT: vmovdqa %ymm4, (%r9) 9224; AVX2-NEXT: vmovdqa %ymm3, 64(%r9) 9225; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 9226; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) 9227; AVX2-NEXT: vmovdqa %ymm7, 32(%rax) 9228; AVX2-NEXT: vmovdqa %ymm1, 64(%rax) 9229; AVX2-NEXT: vmovdqa %ymm11, (%rax) 9230; AVX2-NEXT: addq $1272, %rsp # imm = 0x4F8 9231; AVX2-NEXT: vzeroupper 9232; AVX2-NEXT: retq 9233; 9234; AVX2-FP-LABEL: load_i16_stride6_vf64: 9235; AVX2-FP: # %bb.0: 9236; AVX2-FP-NEXT: subq $1304, %rsp # imm = 0x518 9237; 
AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 9238; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 9239; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm2 9240; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm3 9241; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm4 9242; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5 9243; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm8 9244; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9245; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm9 9246; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9247; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm6 9248; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm7 9249; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] 9250; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill 9251; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] 9252; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9253; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] 9254; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9255; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] 9256; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] 9257; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9258; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] 9259; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9260; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] 9261; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9262; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] 9263; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9264; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 9265; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] 9266; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 9267; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm0 9268; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm5 9269; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] 9270; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] 9271; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] 9272; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 9273; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 9274; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 9275; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9276; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6 9277; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9278; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 9279; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9280; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] 9281; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm6 9282; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm11 9283; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] 9284; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] 9285; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 9286; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 9287; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 9288; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9289; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm8 9290; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 9291; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm6 9292; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9293; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] 9294; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 9295; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm9 9296; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] 9297; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] 9298; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9299; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] 9300; AVX2-FP-NEXT: vpshufb %ymm2, %ymm8, %ymm13 9301; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 9302; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9303; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm13 9304; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9305; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm12 9306; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9307; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] 9308; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm13 9309; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm3 9310; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] 9311; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] 9312; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9313; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 9314; AVX2-FP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] 9315; AVX2-FP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 9316; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 9317; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9318; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 9319; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm11 9320; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 9321; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] 9322; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 9323; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm10 9324; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 9325; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9326; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 9327; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 9328; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] 9329; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 9330; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 9331; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9332; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 9333; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 9334; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] 9335; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm4 9336; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 9337; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9338; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 9339; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 9340; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 9341; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 9342; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 9343; AVX2-FP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9344; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9345; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 9346; AVX2-FP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9347; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm0 9348; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] 9349; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9350; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 9351; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 9352; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 9353; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] 9354; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 9355; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload 9356; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9357; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] 9358; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9359; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 9360; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] 9361; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm0 9362; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9363; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm3 9364; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9365; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] 9366; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 9367; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm3 9368; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm4 9369; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9370; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 9371; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] 9372; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 9373; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] 9374; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 9375; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9376; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9377; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 9378; AVX2-FP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 9379; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm2 9380; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] 9381; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9382; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 9383; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 9384; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 9385; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload 9386; AVX2-FP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] 9387; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9388; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 9389; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm15 9390; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] 9391; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3 9392; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9393; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm4 9394; 
AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9395; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 9396; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 9397; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm4 9398; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9399; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 9400; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] 9401; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 9402; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] 9403; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 9404; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9405; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9406; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 9407; AVX2-FP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 9408; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2 9409; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] 9410; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9411; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 9412; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 9413; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 9414; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 9415; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9416; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] 9417; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9418; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 9419; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] 9420; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm3 9421; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9422; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm4 9423; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9424; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 9425; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm9 9426; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm3 9427; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9428; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm10 9429; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] 9430; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 9431; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] 9432; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 9433; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9434; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2 9435; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9436; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 9437; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9438; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 9439; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm9 9440; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm13 9441; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 9442; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] 9443; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9444; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte 
Folded Reload 9445; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 9446; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm10 9447; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3] 9448; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm12 9449; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 9450; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7] 9451; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9452; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 9453; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] 9454; AVX2-FP-NEXT: vpshufb %ymm15, %ymm10, %ymm15 9455; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] 9456; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 9457; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] 9458; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] 9459; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9460; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] 9461; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 9462; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 9463; AVX2-FP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] 9464; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7] 9465; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 9466; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9467; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 9468; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] 9469; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 9470; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] 9471; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 9472; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9473; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 9474; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] 9475; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7] 9476; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9477; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 9478; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 9479; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9480; AVX2-FP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 9481; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9482; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] 9483; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] 9484; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9485; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 9486; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 9487; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 9488; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9489; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 9490; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5] 9491; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] 9492; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9493; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 9494; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 9495; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9496; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm0 9497; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9498; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] 9499; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] 9500; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9501; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 9502; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 9503; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 9504; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9505; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 9506; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 9507; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] 9508; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9509; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 9510; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 9511; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9512; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 9513; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5] 9514; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 9515; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm1 9516; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 9517; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7] 9518; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] 9519; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 9520; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 9521; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9522; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 9523; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9524; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9525; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 9526; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 9527; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 9528; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9529; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9530; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9531; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 9532; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] 9533; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 9534; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 9535; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] 9536; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9537; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 9538; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 9539; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] 9540; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 9541; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9542; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload 9543; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 9544; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 9545; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] 9546; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] 9547; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] 9548; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm1 9549; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] 9550; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 9551; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9552; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 9553; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 9554; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 9555; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 9556; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 9557; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9558; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9559; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9560; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 9561; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] 9562; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9563; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9564; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9565; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 9566; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] 9567; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9568; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 9569; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] 9570; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9571; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 9572; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] 9573; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 9574; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9575; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 9576; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 9577; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 9578; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 9579; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9580; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] 9581; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9582; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 9583; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] 9584; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 9585; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9586; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 9587; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 9588; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 9589; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 9590; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9591; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9592; AVX2-FP-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9593; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload 9594; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 9595; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9596; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9597; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 9598; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] 9599; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 9600; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] 9601; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm0 9602; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] 9603; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 9604; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9605; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 9606; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 9607; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm0 9608; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] 9609; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 9610; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 9611; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] 9612; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] 9613; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 9614; AVX2-FP-NEXT: vpshufb %ymm11, %ymm13, %ymm5 9615; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] 9616; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] 9617; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 9618; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] 9619; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9620; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9621; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 9622; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 9623; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] 9624; AVX2-FP-NEXT: vpshufb %xmm12, %xmm14, %xmm3 9625; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 9626; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] 9627; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] 9628; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] 9629; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9630; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload 9631; AVX2-FP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 9632; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9633; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 9634; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 9635; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 9636; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] 9637; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm0 9638; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] 9639; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] 9640; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] 9641; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 9642; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, 
%ymm3 9643; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] 9644; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] 9645; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] 9646; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 9647; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9648; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 9649; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 9650; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9651; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 9652; AVX2-FP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] 9653; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] 9654; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] 9655; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 9656; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 9657; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] 9658; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 9659; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 9660; AVX2-FP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 9661; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9662; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] 9663; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] 9664; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] 9665; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] 9666; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9667; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm0 9668; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] 9669; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] 9670; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 9671; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] 9672; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] 9673; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm4 9674; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9675; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] 9676; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] 9677; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] 9678; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] 9679; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9680; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 9681; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9682; AVX2-FP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] 9683; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] 9684; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9685; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 9686; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 9687; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] 9688; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] 9689; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9690; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm6 9691; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9692; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 
= ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] 9693; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] 9694; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] 9695; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7] 9696; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm1 9697; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] 9698; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 9699; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm2 9700; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 9701; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] 9702; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] 9703; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9704; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 9705; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 9706; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] 9707; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] 9708; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9709; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 9710; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9711; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 9712; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] 9713; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 9714; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm1 9715; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 9716; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] 9717; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9718; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload 9719; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] 9720; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9721; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 9722; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9723; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm11 9724; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] 9725; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm1 9726; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm5 9727; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] 9728; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9729; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 9730; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] 9731; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9732; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 9733; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9734; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm13 9735; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] 9736; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 9737; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm10 9738; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7] 9739; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9740; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 9741; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] 9742; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9743; AVX2-FP-NEXT: vpblendd $109, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 9744; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9745; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm15 9746; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] 9747; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 9748; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 9749; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7] 9750; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 9751; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload 9752; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] 9753; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 9754; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 9755; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 9756; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] 9757; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 9758; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 9759; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] 9760; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 9761; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 9762; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] 9763; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 9764; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9765; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] 9766; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 9767; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm12 9768; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] 9769; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 9770; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 9771; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] 9772; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm12 9773; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 9774; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] 9775; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9776; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 9777; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] 9778; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9779; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rsi) 9780; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9781; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi) 9782; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9783; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rsi) 9784; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9785; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi) 9786; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9787; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rdx) 9788; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9789; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) 9790; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9791; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rdx) 9792; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9793; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx) 9794; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9795; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) 9796; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9797; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rcx) 9798; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9799; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rcx) 9800; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9801; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx) 9802; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%r8) 9803; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%r8) 9804; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 9805; AVX2-FP-NEXT: vmovaps %ymm7, 64(%r8) 9806; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8) 9807; AVX2-FP-NEXT: vmovdqa %ymm10, 96(%r9) 9808; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%r9) 9809; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) 9810; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%r9) 9811; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9812; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) 9813; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax) 9814; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax) 9815; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax) 9816; AVX2-FP-NEXT: addq $1304, %rsp # imm = 0x518 9817; AVX2-FP-NEXT: vzeroupper 9818; AVX2-FP-NEXT: retq 9819; 9820; AVX2-FCP-LABEL: load_i16_stride6_vf64: 9821; AVX2-FCP: # %bb.0: 9822; AVX2-FCP-NEXT: subq $1304, %rsp # imm = 0x518 9823; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 9824; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 9825; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm2 9826; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm3 9827; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm4 9828; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 9829; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 9830; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9831; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm9 9832; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9833; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm6 9834; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 9835; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] 9836; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill 9837; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] 9838; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9839; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] 9840; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9841; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] 9842; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] 9843; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9844; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] 9845; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9846; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] 9847; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9848; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] 9849; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9850; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 9851; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] 9852; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 9853; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm0 9854; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 9855; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] 9856; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] 9857; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] 9858; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 9859; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = 
[65535,65535,65535,65535,65535,0,0,0] 9860; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 9861; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9862; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm6 9863; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9864; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 9865; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9866; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] 9867; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm6 9868; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 9869; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] 9870; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] 9871; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] 9872; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 9873; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 9874; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9875; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 9876; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9877; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 9878; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9879; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] 9880; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 9881; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 9882; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] 9883; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] 9884; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9885; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] 9886; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm13 9887; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 9888; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9889; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm13 9890; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9891; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12 9892; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9893; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] 9894; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm13 9895; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 9896; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] 9897; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] 9898; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9899; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload 9900; AVX2-FCP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] 9901; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 9902; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 9903; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9904; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 9905; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm11 9906; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 9907; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] 9908; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 9909; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm10 9910; 
AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 9911; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9912; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 9913; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 9914; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] 9915; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 9916; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 9917; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9918; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 9919; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 9920; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] 9921; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm4 9922; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 9923; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9924; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 9925; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 9926; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 9927; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 9928; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 9929; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9930; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9931; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload 9932; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 9933; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 9934; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] 9935; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9936; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 9937; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 9938; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 9939; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] 9940; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 9941; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload 9942; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 9943; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] 9944; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9945; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 9946; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] 9947; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0 9948; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9949; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm3 9950; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9951; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] 9952; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 9953; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm3 9954; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm4 9955; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9956; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 9957; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] 9958; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 9959; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] 9960; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 9961; AVX2-FCP-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9962; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9963; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 9964; AVX2-FCP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 9965; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2 9966; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] 9967; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9968; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 9969; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 9970; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 9971; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload 9972; AVX2-FCP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] 9973; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9974; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 9975; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm15 9976; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] 9977; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 9978; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9979; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm4 9980; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9981; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 9982; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 9983; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 9984; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9985; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 9986; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] 9987; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 9988; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] 9989; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 9990; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9991; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9992; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 9993; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 9994; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 9995; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] 9996; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9997; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 9998; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 9999; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 10000; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10001; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 10002; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] 10003; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10004; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 10005; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] 10006; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm3 10007; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10008; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 10009; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10010; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] 10011; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm9 10012; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 10013; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10014; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm10 10015; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] 10016; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10017; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] 10018; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 10019; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10020; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 10021; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10022; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 10023; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10024; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 10025; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm9 10026; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 10027; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 10028; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] 10029; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10030; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 10031; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 10032; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10 10033; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3] 10034; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm12 10035; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 10036; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7] 10037; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10038; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 10039; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] 10040; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm15 10041; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] 10042; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10043; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] 10044; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] 10045; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10046; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] 10047; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 10048; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 10049; AVX2-FCP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] 10050; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7] 10051; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 10052; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10053; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 10054; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] 10055; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 10056; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] 10057; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 10058; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10059; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 10060; 
AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] 10061; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7] 10062; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10063; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 10064; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10065; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10066; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 10067; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 10068; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] 10069; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] 10070; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10071; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 10072; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 10073; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10074; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10075; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 10076; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5] 10077; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] 10078; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10079; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 10080; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10081; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10082; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm0 10083; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 10084; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] 10085; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] 10086; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10087; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 10088; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 10089; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 10090; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10091; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 10092; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 10093; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] 10094; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10095; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 10096; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10097; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10098; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 10099; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5] 10100; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 10101; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm1 10102; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 10103; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7] 10104; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] 10105; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 10106; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 10107; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10108; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 10109; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10110; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10111; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 10112; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 10113; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 10114; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10115; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10116; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10117; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10118; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] 10119; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 10120; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 10121; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] 10122; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10123; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 10124; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 10125; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] 10126; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 10127; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10128; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 10129; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 10130; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 10131; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] 10132; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] 10133; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] 10134; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm1 10135; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] 10136; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 10137; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10138; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 10139; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 10140; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 10141; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 10142; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 10143; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10144; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10145; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10146; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 10147; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] 10148; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10149; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10150; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10151; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10152; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] 10153; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10154; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 10155; 
AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] 10156; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10157; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 10158; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] 10159; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 10160; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10161; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 10162; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 10163; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 10164; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] 10165; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10166; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] 10167; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10168; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 10169; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] 10170; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 10171; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10172; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 10173; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 10174; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] 10175; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 10176; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10177; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10178; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10179; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload 10180; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 10181; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10182; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10183; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10184; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] 10185; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 10186; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] 10187; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm0 10188; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] 10189; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] 10190; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10191; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10192; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 10193; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 10194; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] 10195; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 10196; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 10197; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] 10198; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] 10199; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10200; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm5 10201; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] 10202; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] 10203; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 10204; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] 10205; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10206; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10207; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 10208; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10209; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] 10210; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm3 10211; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 10212; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] 10213; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] 10214; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] 10215; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10216; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload 10217; AVX2-FCP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 10218; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10219; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 10220; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10221; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 10222; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] 10223; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm0 10224; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] 10225; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] 10226; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] 10227; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10228; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 10229; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] 10230; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] 10231; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] 10232; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 10233; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10234; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 10235; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 10236; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 10237; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 10238; AVX2-FCP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] 10239; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] 10240; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] 10241; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 10242; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 10243; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] 10244; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 10245; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 10246; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 10247; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10248; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] 10249; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] 10250; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] 10251; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] 10252; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10253; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm0 10254; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] 10255; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] 10256; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 10257; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] 10258; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] 10259; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm4 10260; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10261; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] 10262; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] 10263; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] 10264; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10265; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10266; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 10267; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 10268; AVX2-FCP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] 10269; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] 10270; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10271; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 10272; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 10273; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] 10274; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] 10275; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10276; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm6 10277; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10278; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] 10279; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] 10280; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] 10281; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7] 10282; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm1 10283; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] 10284; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 10285; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm2 10286; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 10287; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] 10288; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] 10289; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10290; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 10291; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] 10292; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] 10293; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] 10294; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10295; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10296; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 10297; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 10298; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] 10299; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 10300; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm1 10301; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 10302; 
AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] 10303; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10304; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload 10305; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] 10306; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10307; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10308; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 10309; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 10310; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] 10311; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm1 10312; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm5 10313; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] 10314; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10315; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 10316; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] 10317; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10318; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10319; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 10320; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 10321; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] 10322; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 10323; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm10 10324; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7] 10325; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10326; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 10327; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] 10328; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10329; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 10330; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 10331; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 10332; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] 10333; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 10334; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 10335; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7] 10336; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 10337; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload 10338; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] 10339; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 10340; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 10341; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 10342; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] 10343; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 10344; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 10345; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] 10346; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 10347; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 10348; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] 10349; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10350; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 10351; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] 10352; 
AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 10353; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm12 10354; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] 10355; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10356; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 10357; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] 10358; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm12 10359; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 10360; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] 10361; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10362; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10363; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] 10364; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10365; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rsi) 10366; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10367; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi) 10368; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10369; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rsi) 10370; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10371; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi) 10372; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10373; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rdx) 10374; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10375; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) 10376; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10377; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rdx) 10378; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10379; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) 10380; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10381; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) 10382; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10383; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx) 10384; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10385; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rcx) 10386; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10387; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx) 10388; AVX2-FCP-NEXT: vmovdqa %ymm8, 96(%r8) 10389; AVX2-FCP-NEXT: vmovdqa %ymm9, 32(%r8) 10390; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10391; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r8) 10392; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8) 10393; AVX2-FCP-NEXT: vmovdqa %ymm10, 96(%r9) 10394; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) 10395; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) 10396; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%r9) 10397; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10398; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) 10399; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) 10400; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rax) 10401; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rax) 10402; AVX2-FCP-NEXT: addq $1304, %rsp # imm = 0x518 10403; AVX2-FCP-NEXT: vzeroupper 10404; AVX2-FCP-NEXT: retq 10405; 10406; AVX512-LABEL: load_i16_stride6_vf64: 10407; AVX512: # %bb.0: 10408; AVX512-NEXT: subq $1480, %rsp # imm = 0x5C8 10409; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 10410; AVX512-NEXT: vmovdqa 608(%rdi), %ymm0 10411; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10412; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1 10413; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10414; AVX512-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 10415; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm0 10416; AVX512-NEXT: vextracti32x4 $1, %ymm1, %xmm20 10417; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16 10418; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3] 10419; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 10420; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 10421; AVX512-NEXT: vmovdqa 544(%rdi), %ymm1 10422; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10423; AVX512-NEXT: vmovdqa 512(%rdi), %ymm2 10424; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10425; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 10426; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2 10427; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] 10428; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 10429; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 10430; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] 10431; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 10432; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10433; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 10434; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10435; AVX512-NEXT: vmovdqa 448(%rdi), %ymm0 10436; AVX512-NEXT: vmovdqa 416(%rdi), %ymm1 10437; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10438; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2 10439; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10440; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 10441; AVX512-NEXT: vpshufb %xmm9, %xmm12, %xmm1 10442; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm22 10443; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3] 10444; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 10445; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] 10446; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] 10447; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10448; AVX512-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 10449; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10450; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] 10451; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 10452; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm0 10453; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23 10454; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 10455; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10456; AVX512-NEXT: vmovdqa 640(%rdi), %ymm0 10457; AVX512-NEXT: vmovdqa 736(%rdi), %ymm1 10458; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10459; AVX512-NEXT: vmovdqa 704(%rdi), %ymm2 10460; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10461; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 10462; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm2 10463; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] 10464; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 10465; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28 10466; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 10467; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10468; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 
= ymm0[2,3],mem[2,3] 10469; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10470; AVX512-NEXT: vinserti128 $1, 672(%rdi), %ymm0, %ymm0 10471; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10472; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] 10473; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 10474; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm0 10475; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17 10476; AVX512-NEXT: vmovdqa64 %ymm2, %ymm29 10477; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 10478; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 10479; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10480; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10481; AVX512-NEXT: vmovdqa 224(%rdi), %ymm0 10482; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10483; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1 10484; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10485; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 10486; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 10487; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3] 10488; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 10489; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm1 10490; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 10491; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 10492; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 10493; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10494; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 10495; AVX512-NEXT: vmovdqa64 %ymm1, %ymm30 10496; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7 10497; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] 10498; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] 10499; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 10500; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10501; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 10502; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10503; AVX512-NEXT: vmovdqa (%rdi), %ymm0 10504; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10505; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 10506; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10507; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 10508; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm0 10509; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm9 10510; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] 10511; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 10512; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] 10513; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 10514; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 10515; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10516; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 10517; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10518; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] 10519; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm2 10520; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 10521; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 10522; AVX512-NEXT: vmovdqa 352(%rdi), %ymm0 10523; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10524; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1 10525; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 10526; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 10527; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 10528; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] 10529; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7] 10530; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm10 10531; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2 10532; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 10533; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10534; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 10535; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] 10536; AVX512-NEXT: vmovdqa64 %ymm0, %ymm31 10537; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 10538; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 10539; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 10540; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 10541; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 10542; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 10543; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10544; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 10545; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm0 10546; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm14 10547; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7] 10548; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 10549; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7 10550; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 10551; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] 10552; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 10553; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 10554; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10555; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm0 10556; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3 10557; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] 10558; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 10559; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm4 10560; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] 10561; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10562; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm0 10563; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] 10564; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 10565; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 10566; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2 10567; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10568; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 10569; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] 10570; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 10571; AVX512-NEXT: vmovdqa64 %ymm0, %ymm27 10572; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 10573; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 
10574; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 10575; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 10576; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 10577; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 10578; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm2 10579; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5] 10580; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] 10581; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10582; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 10583; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10584; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 10585; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 10586; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 10587; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 10588; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm3 10589; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] 10590; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 10591; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10592; AVX512-NEXT: vmovdqa64 %ymm29, %ymm0 10593; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 10594; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1 10595; AVX512-NEXT: vpshufb %xmm14, %xmm1, %xmm1 10596; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5] 10597; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 10598; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10599; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 10600; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] 10601; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 10602; AVX512-NEXT: vmovdqa64 %ymm0, %ymm26 10603; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10604; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10605; AVX512-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10606; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 10607; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] 10608; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 10609; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 10610; AVX512-NEXT: vmovdqa64 %xmm1, %xmm28 10611; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 10612; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] 10613; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 10614; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 10615; AVX512-NEXT: vmovdqa64 %ymm30, %ymm2 10616; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload 10617; AVX512-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] 10618; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 10619; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] 10620; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] 10621; AVX512-NEXT: vmovdqa64 %xmm3, %xmm20 10622; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 10623; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] 10624; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] 10625; AVX512-NEXT: vmovdqa64 %xmm3, %xmm19 10626; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 10627; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10628; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 10629; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10630; AVX512-NEXT: vpblendd 
$36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10631; AVX512-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10632; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 10633; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] 10634; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 10635; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 10636; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18 10637; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 10638; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] 10639; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16 10640; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 10641; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10642; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 10643; AVX512-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] 10644; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 10645; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm3 10646; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 10647; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] 10648; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 10649; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10650; AVX512-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload 10651; AVX512-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 10652; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 10653; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] 10654; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] 10655; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] 10656; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] 10657; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] 10658; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 10659; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10660; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4 10661; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload 10662; AVX512-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7] 10663; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 10664; AVX512-NEXT: vpshufb %ymm5, %ymm13, %ymm4 10665; AVX512-NEXT: vmovdqa64 %ymm5, %ymm24 10666; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] 10667; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] 10668; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 10669; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 10670; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 10671; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm29 & (zmm1 ^ zmm2)) 10672; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm22 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 10673; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm1)) 10674; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10675; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10676; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload 10677; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 10678; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 10679; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] 10680; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] 10681; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] 10682; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] 10683; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] 10684; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 10685; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10686; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 10687; AVX512-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] 10688; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 10689; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] 10690; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] 10691; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] 10692; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] 10693; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] 10694; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] 10695; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10696; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 10697; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10698; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 10699; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 10700; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 10701; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] 10702; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] 10703; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] 10704; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 10705; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] 10706; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 10707; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10708; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 10709; AVX512-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] 10710; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 10711; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] 10712; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 10713; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10714; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10715; AVX512-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 10716; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 10717; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] 10718; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] 10719; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 10720; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] 10721; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] 10722; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] 10723; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10724; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10725; AVX512-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 10726; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 10727; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 10728; AVX512-NEXT: 
vinserti128 $1, %xmm14, %ymm0, %ymm14 10729; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] 10730; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 10731; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] 10732; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 10733; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm23 ^ (zmm29 & (zmm2 ^ zmm23)) 10734; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm2)) 10735; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10736; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0 10737; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 10738; AVX512-NEXT: vmovdqa64 %xmm28, %xmm2 10739; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 10740; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 10741; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] 10742; AVX512-NEXT: vmovdqa64 %xmm19, %xmm2 10743; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] 10744; AVX512-NEXT: vmovdqa64 %xmm20, %xmm14 10745; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] 10746; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] 10747; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] 10748; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10749; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm28 10750; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0 10751; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] 10752; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 10753; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] 10754; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] 10755; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] 10756; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 10757; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 10758; AVX512-NEXT: vpshufb %ymm14, %ymm0, %ymm0 10759; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] 10760; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 10761; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] 10762; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] 10763; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] 10764; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] 10765; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 10766; AVX512-NEXT: vpshufb %ymm10, %ymm13, %ymm13 10767; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10768; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] 10769; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] 10770; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] 10771; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload 10772; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10773; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload 10774; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10775; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13 10776; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10777; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 10778; 
AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 10779; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 10780; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm28 ^ (zmm29 & (zmm0 ^ zmm28)) 10781; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm22 & (zmm25 ^ zmm0)) 10782; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] 10783; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] 10784; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 10785; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] 10786; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] 10787; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] 10788; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] 10789; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] 10790; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10791; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 10792; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] 10793; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] 10794; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 10795; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] 10796; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm5 10797; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] 10798; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] 10799; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm1 10800; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] 10801; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 10802; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] 10803; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 10804; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10805; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] 10806; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] 10807; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 10808; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 10809; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm29 & (zmm2 ^ zmm0)) 10810; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm22 & (zmm28 ^ zmm2)) 10811; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10812; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 10813; AVX512-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 10814; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 10815; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm0 10816; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4 10817; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] 10818; AVX512-NEXT: vmovdqa64 %xmm4, %xmm24 10819; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] 10820; AVX512-NEXT: vmovdqa64 %ymm30, %ymm4 10821; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload 10822; AVX512-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] 10823; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 10824; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] 10825; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] 10826; AVX512-NEXT: vmovdqa64 %xmm4, %xmm22 10827; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] 10828; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] 10829; 
AVX512-NEXT: vpshufb %xmm13, %xmm5, %xmm4 10830; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20 10831; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] 10832; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10833; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 10834; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10835; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 10836; AVX512-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] 10837; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10838; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 10839; AVX512-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 10840; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm0 10841; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm4 10842; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] 10843; AVX512-NEXT: vmovdqa64 %xmm4, %xmm21 10844; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] 10845; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 10846; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] 10847; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm4 10848; AVX512-NEXT: vmovdqa64 %ymm5, %ymm26 10849; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm4 10850; AVX512-NEXT: movw $31, %ax 10851; AVX512-NEXT: kmovw %eax, %k1 10852; AVX512-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} 10853; AVX512-NEXT: vmovdqa64 %ymm31, %ymm3 10854; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload 10855; AVX512-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] 10856; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10857; AVX512-NEXT: vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload 10858; AVX512-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] 10859; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 10860; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] 10861; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] 10862; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27 10863; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] 10864; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm4 10865; AVX512-NEXT: vmovdqa64 %xmm6, %xmm18 10866; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] 10867; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10868; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 10869; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm4 10870; AVX512-NEXT: vmovdqa64 %ymm6, %ymm16 10871; AVX512-NEXT: vmovdqa64 %ymm5, %ymm19 10872; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] 10873; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23 10874; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10875; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload 10876; AVX512-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] 10877; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10878; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload 10879; AVX512-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] 10880; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm3 10881; 
AVX512-NEXT: vextracti128 $1, %ymm15, %xmm14 10882; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] 10883; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 10884; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 10885; AVX512-NEXT: vmovdqa64 %ymm5, %ymm17 10886; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm29) | ymm0 10887; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10888; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 10889; AVX512-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 10890; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 10891; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm9 10892; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7] 10893; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] 10894; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10895; AVX512-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 10896; AVX512-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 10897; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm12 10898; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] 10899; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] 10900; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] 10901; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm6 10902; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] 10903; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10904; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 10905; AVX512-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} 10906; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10907; AVX512-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload 10908; AVX512-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] 10909; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10910; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 10911; AVX512-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 10912; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm7 10913; AVX512-NEXT: vpshufb %xmm13, %xmm7, %xmm3 10914; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] 10915; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] 10916; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] 10917; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] 10918; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0 10919; AVX512-NEXT: vpshufb %ymm0, %ymm8, %ymm3 10920; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10921; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7] 10922; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16 10923; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 10924; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 10925; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] 10926; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 10927; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] 10928; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] 10929; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 10930; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm3 10931; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0 10932; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] 10933; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] 10934; AVX512-NEXT: vpblendw {{.*#+}} 
xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] 10935; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10936; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 10937; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3 10938; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm21[1,1,2,3] 10939; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 10940; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7] 10941; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] 10942; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 10943; AVX512-NEXT: vpshufb %ymm10, %ymm0, %ymm13 10944; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm13 10945; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} 10946; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 10947; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm3 10948; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0 10949; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] 10950; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] 10951; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] 10952; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 10953; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0 10954; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm0 10955; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10956; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] 10957; AVX512-NEXT: vmovdqa64 %ymm17, %ymm3 10958; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm3 10959; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm10 10960; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] 10961; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] 10962; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] 10963; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm29) | ymm3 10964; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 10965; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] 10966; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] 10967; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] 10968; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm3 10969; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm5 10970; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 10971; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm9 10972; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] 10973; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] 10974; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] 10975; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10976; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 10977; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} 10978; AVX512-NEXT: vpshufb %ymm13, %ymm8, %ymm8 10979; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 10980; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] 10981; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] 10982; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] 10983; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 10984; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] 10985; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 10986; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 10987; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 10988; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload 10989; AVX512-NEXT: # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem)) 10990; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 10991; AVX512-NEXT: kmovw %eax, %k1 10992; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 10993; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} 10994; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) 10995; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 10996; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload 10997; AVX512-NEXT: # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem)) 10998; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 10999; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} 11000; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) 11001; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 11002; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload 11003; AVX512-NEXT: # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem)) 11004; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 11005; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} 11006; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rdx) 11007; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload 11008; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload 11009; AVX512-NEXT: # zmm7 = mem ^ (zmm6 & (zmm7 ^ mem)) 11010; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11011; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} 11012; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) 11013; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 11014; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ zmm30)) 11015; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm31)) 11016; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) 11017; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm6 & (zmm4 ^ zmm1)) 11018; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11019; AVX512-NEXT: vmovaps %zmm1, 64(%rcx) 11020; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 11021; AVX512-NEXT: vmovaps %zmm1, (%rcx) 11022; AVX512-NEXT: vmovdqa64 %zmm28, 64(%r8) 11023; AVX512-NEXT: vmovdqa64 %zmm25, (%r8) 11024; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r9) 11025; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) 11026; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 11027; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax) 11028; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) 11029; AVX512-NEXT: addq $1480, %rsp # imm = 0x5C8 11030; AVX512-NEXT: vzeroupper 11031; AVX512-NEXT: retq 11032; 11033; AVX512-FCP-LABEL: load_i16_stride6_vf64: 11034; AVX512-FCP: # %bb.0: 11035; AVX512-FCP-NEXT: subq $1416, %rsp # imm = 0x588 11036; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 11037; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm0 11038; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11039; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 11040; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11041; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 11042; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0 11043; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 11044; AVX512-FCP-NEXT: 
vmovdqa64 %ymm2, %ymm20 11045; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] 11046; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 11047; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 11048; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 11049; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm1 11050; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11051; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 11052; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11053; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 11054; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 11055; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 11056; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 11057; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm2 11058; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 11059; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] 11060; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11061; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 11062; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11063; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 11064; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm1 11065; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11066; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 11067; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11068; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 11069; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm1 11070; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 11071; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] 11072; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 11073; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 11074; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] 11075; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] 11076; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11077; AVX512-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 11078; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11079; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] 11080; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 11081; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0 11082; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 11083; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 11084; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11085; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0 11086; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11087; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm1 11088; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11089; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 11090; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm0 11091; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm2 11092; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 11093; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 11094; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] 11095; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm1 11096; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 
11097; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] 11098; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11099; AVX512-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 11100; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11101; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] 11102; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 11103; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 11104; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16 11105; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm29 11106; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 11107; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] 11108; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11109; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11110; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 11111; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11112; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 11113; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11114; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 11115; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm0 11116; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm1 11117; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,0,3] 11118; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm0 11119; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 11120; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 11121; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11122; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 11123; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 11124; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 11125; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 11126; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 11127; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2 11128; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] 11129; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11130; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 11131; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11132; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 11133; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11134; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 11135; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11136; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 11137; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 11138; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 11139; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] 11140; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0 11141; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 11142; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 11143; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 11144; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11145; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 11146; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11147; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] 11148; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2 11149; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 11150; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11151; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 11152; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 11153; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 11154; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11155; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 11156; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 11157; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 11158; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 11159; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] 11160; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 11161; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 11162; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11163; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 11164; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] 11165; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 11166; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 11167; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 11168; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 11169; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 11170; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 11171; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 11172; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11173; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 11174; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 11175; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm14 11176; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7] 11177; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 11178; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 11179; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 11180; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] 11181; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 11182; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 11183; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11184; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm0 11185; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm3 11186; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] 11187; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 11188; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm5 11189; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] 11190; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11191; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm0 11192; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] 11193; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 11194; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 11195; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 11196; 
AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11197; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 11198; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] 11199; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 11200; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 11201; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 11202; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 11203; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 11204; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 11205; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] 11206; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 11207; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 11208; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5] 11209; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] 11210; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11211; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 11212; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11213; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 11214; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 11215; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm2 11216; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 11217; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 11218; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 11219; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 11220; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11221; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 11222; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 11223; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 11224; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 11225; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] 11226; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 11227; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11228; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 11229; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] 11230; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11231; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 11232; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11233; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11234; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 11235; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 11236; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] 11237; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] 11238; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] 11239; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm1 11240; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 11241; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] 11242; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 11243; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 11244; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 11245; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11246; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] 11247; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 11248; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] 11249; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 11250; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm2 11251; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 11252; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] 11253; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] 11254; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20 11255; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] 11256; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11257; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 11258; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11259; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11260; AVX512-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 11261; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 11262; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] 11263; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] 11264; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] 11265; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm1 11266; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm19 11267; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] 11268; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm18 11269; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] 11270; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11271; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 11272; AVX512-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] 11273; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 11274; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm5 11275; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 11276; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 11277; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 11278; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11279; AVX512-FCP-NEXT: vpblendd $219, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload 11280; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] 11281; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 11282; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] 11283; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm5 11284; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 11285; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1] 11286; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4] 11287; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] 11288; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 11289; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm6 11290; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload 11291; AVX512-FCP-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] 11292; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 11293; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 11294; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 11295; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] 11296; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 11297; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 11298; AVX512-FCP-NEXT: 
vinserti64x4 $1, %ymm5, %zmm0, %zmm5 11299; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 11300; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm29 & (zmm4 ^ zmm3)) 11301; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 11302; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm26 & (zmm5 ^ zmm4)) 11303; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11304; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11305; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11306; AVX512-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 11307; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 11308; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1] 11309; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 11310; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3] 11311; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] 11312; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] 11313; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11314; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11315; AVX512-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] 11316; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 11317; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] 11318; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm3 11319; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] 11320; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4] 11321; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 11322; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 11323; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11 11324; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11325; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11326; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 11327; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 11328; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1] 11329; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 11330; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] 11331; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] 11332; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] 11333; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11334; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 11335; AVX512-FCP-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] 11336; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 11337; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 11338; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 11339; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11340; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11341; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 11342; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 11343; AVX512-FCP-NEXT: vpshufd {{.*#+}} 
xmm4 = xmm0[2,1,0,3] 11344; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm0 11345; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] 11346; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4] 11347; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] 11348; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11349; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 11350; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 11351; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 11352; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 11353; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 11354; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] 11355; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 11356; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] 11357; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 11358; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm11 ^ (zmm29 & (zmm1 ^ zmm11)) 11359; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm26 & (zmm25 ^ zmm1)) 11360; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] 11361; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 11362; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 11363; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 11364; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 11365; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] 11366; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 11367; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 11368; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm11 11369; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm11 11370; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm13 11371; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] 11372; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7] 11373; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 11374; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24 11375; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] 11376; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 11377; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11 11378; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm19 11379; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm0 11380; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] 11381; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7] 11382; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 11383; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 11384; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 11385; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] 11386; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] 11387; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm13 11388; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm13 11389; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] 11390; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7] 11391; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 11392; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15 11393; AVX512-FCP-NEXT: vinserti128 $1, 
%xmm13, %ymm0, %ymm13 11394; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] 11395; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] 11396; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] 11397; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm23 11398; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm29 & (zmm0 ^ zmm24)) 11399; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm26 & (zmm23 ^ zmm0)) 11400; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 11401; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 11402; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload 11403; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11404; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload 11405; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11406; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm10 11407; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11408; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm10 11409; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11410; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7] 11411; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] 11412; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 11413; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] 11414; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] 11415; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 11416; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 11417; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 11418; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 11419; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] 11420; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] 11421; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 11422; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] 11423; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 11424; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 11425; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 11426; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] 11427; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] 11428; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11429; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 11430; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] 11431; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 11432; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 11433; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm29 & (zmm5 ^ zmm0)) 11434; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm26 & (zmm28 ^ zmm5)) 11435; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11436; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload 11437; AVX512-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 11438; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 11439; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm0 11440; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 11441; AVX512-FCP-NEXT: 
vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] 11442; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 11443; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 11444; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 11445; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11446; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] 11447; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 11448; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] 11449; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 11450; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 11451; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 11452; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm3 11453; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 11454; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] 11455; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11456; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 11457; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11458; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 11459; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] 11460; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11461; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 11462; AVX512-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 11463; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0 11464; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 11465; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] 11466; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 11467; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 11468; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] 11469; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 11470; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 11471; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm29) | ymm3 11472; AVX512-FCP-NEXT: movw $31, %ax 11473; AVX512-FCP-NEXT: kmovw %eax, %k1 11474; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} 11475; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 11476; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload 11477; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] 11478; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11479; AVX512-FCP-NEXT: vpblendd $146, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 11480; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] 11481; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 11482; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] 11483; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 11484; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm20 11485; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm3 11486; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 11487; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] 11488; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11489; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 11490; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, 
%ymm3 11491; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm16 11492; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 11493; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 11494; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 11495; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11496; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload 11497; AVX512-FCP-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] 11498; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11499; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 11500; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] 11501; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm3 11502; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13 11503; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] 11504; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 11505; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 11506; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm0 11507; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11508; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 11509; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 11510; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 11511; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 11512; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,2,2,2,4,5,6,7] 11513; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] 11514; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11515; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 11516; AVX512-FCP-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 11517; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 11518; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] 11519; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm4 11520; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm5 11521; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] 11522; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 11523; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm31 11524; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} 11525; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11526; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 11527; AVX512-FCP-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] 11528; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11529; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 11530; AVX512-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 11531; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 11532; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 11533; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm4 11534; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12 11535; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] 11536; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 11537; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm12 11538; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 11539; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7] 11540; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 
11541; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 11542; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm14 11543; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm10 11544; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] 11545; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] 11546; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 11547; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm10 11548; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17 11549; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm29) | ymm10 11550; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 11551; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm10 11552; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm11 11553; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] 11554; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 11555; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 11556; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 11557; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm12 11558; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 11559; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7] 11560; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11561; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0 11562; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm0 {%k1} 11563; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm11 11564; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm11 11565; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm12 11566; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12 11567; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] 11568; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 11569; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 11570; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14 11571; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 11572; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7] 11573; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 11574; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 11575; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13 11576; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 11577; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7] 11578; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm29) | ymm14 11579; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7 11580; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 11581; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7] 11582; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm4 11583; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 11584; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 11585; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 11586; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 11587; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7] 11588; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 11589; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 11590; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm2 {%k1} 11591; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 11592; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 11593; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11594; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] 11595; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11596; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] 11597; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 11598; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 11599; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 11600; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload 11601; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) 11602; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 11603; AVX512-FCP-NEXT: kmovw %eax, %k1 11604; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11605; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} 11606; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) 11607; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 11608; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload 11609; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) 11610; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11611; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} 11612; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) 11613; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 11614; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload 11615; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) 11616; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 11617; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} 11618; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) 11619; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 11620; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload 11621; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) 11622; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 11623; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} 11624; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) 11625; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm26 & (zmm4 ^ zmm30)) 11626; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm31)) 11627; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm0)) 11628; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm26 & (zmm1 ^ zmm2)) 11629; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%rcx) 11630; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 11631; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) 11632; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%r8) 11633; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) 11634; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) 11635; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 11636; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11637; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 11638; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) 11639; AVX512-FCP-NEXT: addq $1416, %rsp # imm = 0x588 11640; AVX512-FCP-NEXT: vzeroupper 11641; AVX512-FCP-NEXT: retq 11642; 11643; AVX512DQ-LABEL: load_i16_stride6_vf64: 11644; AVX512DQ: # %bb.0: 11645; AVX512DQ-NEXT: subq $840, %rsp # imm = 0x348 11646; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] 11647; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm0 11648; AVX512DQ-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11649; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1 11650; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11651; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 11652; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm0 11653; AVX512DQ-NEXT: vextracti32x4 $1, %ymm1, %xmm24 11654; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm25 11655; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3] 11656; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 11657; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 11658; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm1 11659; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11660; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm2 11661; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11662; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 11663; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 11664; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] 11665; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm23 11666; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] 11667; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22 11668; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] 11669; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11670; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11671; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 11672; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm1 11673; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2 11674; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11675; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm3 11676; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11677; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 11678; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm2 11679; AVX512DQ-NEXT: vextracti32x4 $1, %ymm15, %xmm21 11680; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3] 11681; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 11682; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 11683; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] 11684; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11685; AVX512DQ-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 11686; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11687; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] 11688; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 11689; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm1 11690; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20 11691; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] 11692; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm1 11693; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm2 11694; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11695; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm4 11696; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11697; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] 11698; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm4 11699; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] 11700; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] 11701; AVX512DQ-NEXT: 
vmovdqa64 %xmm4, %xmm19 11702; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2 11703; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11704; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] 11705; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11706; AVX512DQ-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 11707; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11708; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7] 11709; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 11710; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm1 11711; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm27 11712; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 11713; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] 11714; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] 11715; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 11716; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 11717; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0)) 11718; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800 11719; AVX512DQ-NEXT: kmovw %eax, %k1 11720; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} 11721; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11722; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm0 11723; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11724; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1 11725; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11726; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 11727; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 11728; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] 11729; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 11730; AVX512DQ-NEXT: vpshufb %xmm9, %xmm13, %xmm1 11731; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 11732; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 11733; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11734; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 11735; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11736; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 11737; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm11 11738; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] 11739; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7] 11740; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11741; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11742; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 11743; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 11744; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 11745; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 11746; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11747; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 11748; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm0 11749; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9 11750; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] 11751; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 11752; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] 11753; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2 11754; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 11755; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 11756; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] 11757; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31 11758; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm30 11759; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm2 11760; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7] 11761; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0 11762; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11763; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1 11764; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11765; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 11766; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm8 11767; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] 11768; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7] 11769; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm10 11770; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2 11771; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 11772; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 11773; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] 11774; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26 11775; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29 11776; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0 11777; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 11778; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 11779; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 11780; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 11781; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 11782; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm17 ^ (zmm16 & (zmm7 ^ zmm17)) 11783; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} 11784; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11785; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] 11786; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 11787; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm13 11788; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] 11789; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 11790; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm7 11791; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 11792; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] 11793; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 11794; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 11795; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm4 11796; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3 11797; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] 11798; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 11799; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm6 11800; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7] 11801; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm3 11802; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] 11803; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] 11804; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = 
[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 11805; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 11806; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11807; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 11808; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] 11809; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 11810; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0)) 11811; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} 11812; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11813; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0 11814; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm0 11815; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 11816; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11817; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 11818; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm1 11819; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 11820; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2 11821; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 11822; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 11823; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11824; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 11825; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 11826; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 11827; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 11828; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2 11829; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm4 11830; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] 11831; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] 11832; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1 11833; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 11834; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm2 11835; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 11836; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] 11837; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] 11838; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11839; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] 11840; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 11841; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 11842; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0)) 11843; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} 11844; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11845; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11846; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11847; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 11848; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 11849; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] 11850; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 11851; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 11852; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 11853; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 11854; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] 11855; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22 11856; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 11857; AVX512DQ-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11858; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11859; AVX512DQ-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 11860; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 11861; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] 11862; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] 11863; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23 11864; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 11865; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] 11866; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] 11867; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm24 11868; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] 11869; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11870; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 11871; AVX512DQ-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 11872; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11873; AVX512DQ-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 11874; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 11875; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] 11876; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 11877; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 11878; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25 11879; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] 11880; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] 11881; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16 11882; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 11883; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0 11884; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm3 11885; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7] 11886; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 11887; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm3 11888; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 11889; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] 11890; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 11891; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11892; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 11893; AVX512DQ-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 11894; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 11895; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] 11896; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] 11897; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] 11898; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] 11899; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] 11900; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 11901; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 11902; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4 11903; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm5 11904; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7] 11905; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 11906; AVX512DQ-NEXT: vpshufb %ymm5, %ymm13, %ymm4 11907; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm27 11908; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] 11909; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] 11910; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 11911; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 11912; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 11913; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm21 & (zmm1 ^ zmm2)) 11914; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm18 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 11915; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm1)) 11916; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 11917; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11918; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11919; AVX512DQ-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 11920; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 11921; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] 11922; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] 11923; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] 11924; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] 11925; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] 11926; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 11927; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11928; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 11929; AVX512DQ-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] 11930; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 11931; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] 11932; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] 11933; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] 11934; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] 11935; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] 11936; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] 11937; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11938; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm19 11939; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11940; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 11941; AVX512DQ-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 11942; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 11943; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] 11944; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] 11945; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] 11946; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] 11947; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] 11948; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 11949; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11950; AVX512DQ-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 11951; AVX512DQ-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] 11952; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 11953; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] 11954; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] 11955; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11956; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 11957; AVX512DQ-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 11958; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm0 11959; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] 11960; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] 11961; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] 11962; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] 11963; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] 11964; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] 11965; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11966; AVX512DQ-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 11967; AVX512DQ-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] 11968; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0 11969; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 11970; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 11971; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] 11972; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 11973; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] 11974; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 11975; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (zmm21 & (zmm2 ^ zmm19)) 11976; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm2)) 11977; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 11978; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 11979; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 11980; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 11981; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 11982; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] 11983; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 11984; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] 11985; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm14 11986; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] 11987; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] 11988; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] 11989; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11990; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm20 11991; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 11992; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] 11993; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0 11994; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] 11995; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] 11996; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] 11997; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 11998; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0 11999; AVX512DQ-NEXT: vpshufb %ymm14, %ymm0, %ymm0 12000; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] 12001; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 12002; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] 12003; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] 12004; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = 
xmm10[0,1,2,3,4,5,7,7] 12005; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] 12006; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 12007; AVX512DQ-NEXT: vpshufb %ymm10, %ymm13, %ymm13 12008; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12009; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] 12010; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] 12011; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] 12012; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 12013; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm21 & (zmm0 ^ zmm20)) 12014; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm18 & (zmm27 ^ zmm0)) 12015; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] 12016; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] 12017; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 12018; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] 12019; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] 12020; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] 12021; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] 12022; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] 12023; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12024; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 12025; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] 12026; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] 12027; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 12028; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] 12029; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm5 12030; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] 12031; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] 12032; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm1 12033; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] 12034; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 12035; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] 12036; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 12037; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12038; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] 12039; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] 12040; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 12041; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 12042; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm21 & (zmm2 ^ zmm0)) 12043; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm2)) 12044; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12045; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 12046; AVX512DQ-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 12047; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 12048; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm0 12049; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm13 12050; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] 12051; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 12052; AVX512DQ-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12053; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12054; AVX512DQ-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 12055; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm5 12056; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] 12057; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] 12058; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm19 12059; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] 12060; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] 12061; AVX512DQ-NEXT: vpshufb %xmm11, %xmm5, %xmm4 12062; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18 12063; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] 12064; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12065; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm23 12066; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0 12067; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm2 12068; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] 12069; AVX512DQ-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 12070; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload 12071; AVX512DQ-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 12072; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm0 12073; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm4 12074; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] 12075; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm30 12076; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 12077; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 12078; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] 12079; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm4 12080; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm31 12081; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm22) | ymm4 12082; AVX512DQ-NEXT: movw $31, %ax 12083; AVX512DQ-NEXT: kmovw %eax, %k1 12084; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} 12085; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm2 12086; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm4 12087; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2,3,4,5],ymm2[6],ymm4[7] 12088; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12089; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 12090; AVX512DQ-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 12091; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm6 12092; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] 12093; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] 12094; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 12095; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] 12096; AVX512DQ-NEXT: vpshufb %xmm11, %xmm6, %xmm4 12097; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm16 12098; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] 12099; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12100; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 12101; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm4 12102; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm21 12103; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm17 12104; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm4[0,1,2,3,4],ymm2[5,6,7] 12105; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 12106; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12107; AVX512DQ-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload 12108; AVX512DQ-NEXT: # ymm14 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] 12109; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12110; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload 12111; AVX512DQ-NEXT: # ymm15 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] 12112; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm2 12113; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm12 12114; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] 12115; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] 12116; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0 12117; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm22) | ymm0 12118; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12119; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 12120; AVX512DQ-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 12121; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 12122; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm10 12123; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,2,2,2,4,5,6,7] 12124; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] 12125; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12126; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 12127; AVX512DQ-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 12128; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm8 12129; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] 12130; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] 12131; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] 12132; AVX512DQ-NEXT: vpshufb %xmm11, %xmm8, %xmm5 12133; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] 12134; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 12135; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm25 12136; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1} 12137; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12138; AVX512DQ-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 12139; AVX512DQ-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] 12140; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12141; AVX512DQ-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 12142; AVX512DQ-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 12143; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4 12144; AVX512DQ-NEXT: vpshufb %xmm11, %xmm4, %xmm11 12145; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] 12146; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7] 12147; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] 12148; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7] 12149; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0 12150; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm11 12151; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12152; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7] 12153; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 12154; AVX512DQ-NEXT: 
vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] 12155; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3 12156; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] 12157; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] 12158; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] 12159; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] 12160; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0 12161; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 12162; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm13 12163; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] 12164; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] 12165; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] 12166; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12167; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm26 12168; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm9 12169; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[1,1,2,3] 12170; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] 12171; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] 12172; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] 12173; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0 12174; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm13 12175; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm22) | ymm13 12176; AVX512DQ-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} 12177; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 12178; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm9 12179; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 12180; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] 12181; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] 12182; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6,7] 12183; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 12184; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0 12185; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm0 12186; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 12187; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] 12188; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 12189; AVX512DQ-NEXT: vpshufb %ymm11, %ymm14, %ymm9 12190; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm11 12191; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3] 12192; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] 12193; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] 12194; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (ymm11 & ymm22) | ymm9 12195; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 12196; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3] 12197; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 12198; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] 12199; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm6 12200; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] 12201; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] 12202; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] 12203; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 12204; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 12205; AVX512DQ-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1} 12206; AVX512DQ-NEXT: vpshufb %ymm13, 
%ymm5, %ymm5 12207; AVX512DQ-NEXT: vpshufb %xmm3, %xmm4, %xmm3 12208; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] 12209; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] 12210; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] 12211; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12212; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] 12213; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 12214; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12215; AVX512DQ-NEXT: vmovaps %zmm3, (%rsi) 12216; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12217; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rsi) 12218; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12219; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rdx) 12220; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12221; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx) 12222; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 12223; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm3 & (zmm24 ^ zmm23)) 12224; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm3 & (zmm21 ^ zmm25)) 12225; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm3 & (zmm0 ^ zmm26)) 12226; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm1)) 12227; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rcx) 12228; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 12229; AVX512DQ-NEXT: vmovaps %zmm1, (%rcx) 12230; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%r8) 12231; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r8) 12232; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%r9) 12233; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) 12234; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 12235; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) 12236; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 12237; AVX512DQ-NEXT: addq $840, %rsp # imm = 0x348 12238; AVX512DQ-NEXT: vzeroupper 12239; AVX512DQ-NEXT: retq 12240; 12241; AVX512DQ-FCP-LABEL: load_i16_stride6_vf64: 12242; AVX512DQ-FCP: # %bb.0: 12243; AVX512DQ-FCP-NEXT: subq $872, %rsp # imm = 0x368 12244; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] 12245; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm0 12246; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12247; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 12248; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12249; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 12250; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm0 12251; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 12252; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 12253; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] 12254; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1 12255; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 12256; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 12257; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm1 12258; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12259; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 12260; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12261; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 12262; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] 
12263; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 12264; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 12265; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 12266; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm2 12267; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 12268; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] 12269; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12270; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 12271; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1 12272; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 12273; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12274; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 12275; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12276; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 12277; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm2 12278; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3 12279; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] 12280; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 12281; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 12282; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] 12283; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] 12284; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12285; AVX512DQ-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 12286; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12287; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] 12288; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] 12289; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 12290; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 12291; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] 12292; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm1 12293; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12294; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 12295; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12296; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 12297; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 12298; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm4 12299; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 12300; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm19 12301; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] 12302; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm2 12303; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12304; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] 12305; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12306; AVX512DQ-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 12307; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12308; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] 12309; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] 12310; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 12311; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 12312; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 12313; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 12314; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] 12315; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 12316; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 12317; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0)) 12318; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 12319; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 12320; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} 12321; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12322; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 12323; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12324; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 12325; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12326; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 12327; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0 12328; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm1 12329; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] 12330; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0 12331; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] 12332; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 12333; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12334; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 12335; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 12336; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 12337; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 12338; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 12339; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2 12340; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] 12341; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12342; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 12343; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 12344; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12345; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 12346; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12347; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 12348; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 12349; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2 12350; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3] 12351; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm0 12352; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] 12353; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 12354; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 12355; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12356; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 12357; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] 12358; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 12359; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm2 12360; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3,4,5,6,7] 12361; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 12362; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 12363; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 12364; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12365; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 12366; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 12367; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8 12368; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10 12369; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] 12370; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 12371; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] 12372; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 12373; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] 12374; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 12375; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm30 12376; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 12377; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 12378; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 12379; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 12380; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] 12381; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 12382; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm17 ^ (zmm16 & (zmm6 ^ zmm17)) 12383; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} 12384; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12385; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] 12386; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm0 12387; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm13 12388; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2],xmm0[3],xmm13[4,5],xmm0[6,7] 12389; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] 12390; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm7 12391; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 12392; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] 12393; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 12394; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 12395; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 12396; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm5 12397; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] 12398; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] 12399; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm6 12400; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7] 12401; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm3 12402; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] 12403; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] 12404; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] 12405; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 12406; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12407; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] 12408; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] 12409; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 12410; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0)) 12411; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} 12412; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12413; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 12414; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 12415; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 12416; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 12417; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] 12418; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 12419; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 12420; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 12421; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 12422; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] 12423; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12424; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 12425; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 12426; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 12427; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm2 12428; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 12429; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 12430; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] 12431; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] 12432; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 12433; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 12434; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 12435; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 12436; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] 12437; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] 12438; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12439; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] 12440; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] 12441; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 12442; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0)) 12443; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} 12444; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12445; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12446; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12447; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 12448; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 12449; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] 12450; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] 12451; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1] 12452; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 12453; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] 12454; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] 12455; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12456; AVX512DQ-FCP-NEXT: vpblendd $36, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 12457; AVX512DQ-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] 12458; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 12459; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] 12460; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = 
[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] 12461; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 12462; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm19 12463; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] 12464; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] 12465; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 12466; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] 12467; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12468; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 12469; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 12470; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 12471; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 12472; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 12473; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] 12474; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] 12475; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] 12476; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm1 12477; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm23 12478; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] 12479; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 12480; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] 12481; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 12482; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload 12483; AVX512DQ-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] 12484; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] 12485; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm5 12486; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 12487; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 12488; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 12489; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 12490; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 12491; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] 12492; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 12493; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] 12494; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm5 12495; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm25 12496; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] 12497; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] 12498; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm18 12499; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] 12500; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 12501; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm6 12502; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm7 12503; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] 12504; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] 12505; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 12506; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm26 12507; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 12508; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] 12509; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] 
12510; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 12511; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 12512; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 12513; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm20 & (zmm4 ^ zmm3)) 12514; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 12515; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm4)) 12516; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12517; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12518; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12519; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 12520; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 12521; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] 12522; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 12523; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm16 12524; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3] 12525; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7] 12526; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] 12527; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 12528; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 12529; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 12530; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 12531; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] 12532; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm3 12533; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1] 12534; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4] 12535; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 12536; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12537; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm9 12538; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12539; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12540; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 12541; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 12542; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] 12543; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 12544; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] 12545; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7] 12546; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] 12547; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12548; AVX512DQ-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload 12549; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] 12550; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 12551; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 12552; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 12553; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12554; AVX512DQ-FCP-NEXT: vpblendd 
$36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12555; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 12556; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 12557; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] 12558; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm0 12559; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1] 12560; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4] 12561; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] 12562; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12563; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 12564; AVX512DQ-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] 12565; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 12566; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 12567; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 12568; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] 12569; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] 12570; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 12571; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 12572; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm20 & (zmm1 ^ zmm9)) 12573; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm28 & (zmm27 ^ zmm1)) 12574; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] 12575; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm0 12576; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 12577; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] 12578; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 12579; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] 12580; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 12581; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm9 12582; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm1 12583; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5] 12584; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] 12585; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 12586; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21 12587; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] 12588; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 12589; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm9 12590; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 12591; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] 12592; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7] 12593; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] 12594; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 12595; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 12596; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] 12597; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7] 12598; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm0 12599; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm15 12600; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm0 12601; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] 12602; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] 12603; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] 12604; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 12605; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 12606; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 12607; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] 12608; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] 12609; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] 12610; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm19 12611; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm21 ^ (zmm20 & (zmm1 ^ zmm21)) 12612; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm1)) 12613; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 12614; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 12615; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 12616; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7] 12617; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] 12618; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm1 12619; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] 12620; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] 12621; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12622; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 12623; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm1 12624; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] 12625; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7] 12626; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 12627; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] 12628; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] 12629; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 12630; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 12631; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] 12632; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] 12633; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12634; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] 12635; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] 12636; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 12637; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 12638; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm20 & (zmm1 ^ zmm0)) 12639; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm1)) 12640; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12641; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload 12642; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 12643; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] 12644; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0 12645; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 12646; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] 12647; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 12648; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 12649; AVX512DQ-FCP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 12650; 
AVX512DQ-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] 12651; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 12652; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] 12653; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] 12654; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm2 12655; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 12656; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm3 12657; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 12658; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] 12659; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12660; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 12661; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 12662; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload 12663; AVX512DQ-FCP-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] 12664; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12665; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 12666; AVX512DQ-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 12667; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0 12668; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 12669; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] 12670; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] 12671; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 12672; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] 12673; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3 12674; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm20) | ymm3 12675; AVX512DQ-FCP-NEXT: movw $31, %ax 12676; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 12677; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} 12678; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 12679; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 12680; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7] 12681; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12682; AVX512DQ-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 12683; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] 12684; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 12685; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] 12686; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm0 12687; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 12688; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm3 12689; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm29 12690; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] 12691; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12692; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] 12693; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm3 12694; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 12695; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 12696; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 12697; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 12698; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12699; 
AVX512DQ-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload 12700; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] 12701; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12702; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload 12703; AVX512DQ-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] 12704; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 12705; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 12706; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,2,2,2,4,5,6,7] 12707; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 12708; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] 12709; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm2 12710; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31 12711; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 12712; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12713; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload 12714; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 12715; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 12716; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8 12717; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,2,2,2,4,5,6,7] 12718; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] 12719; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12720; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 12721; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 12722; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 12723; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,3,2,1] 12724; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm4 12725; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm12 12726; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] 12727; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 12728; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm24 12729; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} 12730; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12731; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 12732; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] 12733; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 12734; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 12735; AVX512DQ-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] 12736; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 12737; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] 12738; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm12 12739; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm13 12740; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7] 12741; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 12742; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm13 12743; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 12744; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7] 12745; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm13 12746; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = 
[10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] 12747; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm15 12748; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10 12749; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6,7] 12750; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] 12751; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm14 12752; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 12753; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm14 12754; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm14 12755; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 12756; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7] 12757; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] 12758; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 12759; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 12760; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm15 12761; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 12762; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] 12763; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12764; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm0 12765; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm0 {%k1} 12766; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 12767; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm10 12768; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm14 12769; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm14 12770; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7] 12771; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 12772; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm15 12773; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15 12774; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 12775; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7] 12776; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 12777; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm15 12778; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm11 12779; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm15 12780; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm11 12781; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 12782; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 12783; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] 12784; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm20) | ymm15 12785; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 12786; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 12787; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7] 12788; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 12789; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 12790; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6,7] 12791; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 12792; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 12793; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm2 {%k1} 12794; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 12795; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 12796; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 12797; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] 12798; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12799; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] 12800; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 12801; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12802; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi) 12803; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12804; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rsi) 12805; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12806; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) 12807; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 12808; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) 12809; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm22)) 12810; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm28 & (zmm13 ^ zmm24)) 12811; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm28 & (zmm10 ^ zmm0)) 12812; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm28 & (zmm1 ^ zmm2)) 12813; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) 12814; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12815; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) 12816; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%r8) 12817; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) 12818; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r9) 12819; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9) 12820; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 12821; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) 12822; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) 12823; AVX512DQ-FCP-NEXT: addq $872, %rsp # imm = 0x368 12824; AVX512DQ-FCP-NEXT: vzeroupper 12825; AVX512DQ-FCP-NEXT: retq 12826; 12827; AVX512BW-LABEL: load_i16_stride6_vf64: 12828; AVX512BW: # %bb.0: 12829; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 12830; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 12831; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 12832; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 12833; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 12834; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4 12835; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 12836; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 12837; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 12838; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 12839; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 12840; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 12841; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 12842; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] 12843; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 12844; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 12845; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 12846; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] 12847; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 12848; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 12849; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 12850; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] 12851; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 12852; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 12853; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 12854; AVX512BW-NEXT: kmovd %edi, %k1 12855; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} 12856; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 12857; AVX512BW-NEXT: kmovd %edi, %k2 12858; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm8 {%k2} 12859; AVX512BW-NEXT: vpermi2w 
%zmm4, %zmm6, %zmm14 12860; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 12861; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 12862; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1} 12863; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} 12864; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] 12865; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] 12866; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 12867; AVX512BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 12868; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] 12869; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 12870; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 12871; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 12872; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] 12873; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 12874; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 12875; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} 12876; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2} 12877; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm16 12878; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm18 12879; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm14 12880; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm14 {%k1} 12881; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm14 {%k2} 12882; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] 12883; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] 12884; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 12885; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 12886; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] 12887; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 12888; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 12889; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 12890; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] 12891; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 12892; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 12893; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 12894; AVX512BW-NEXT: kmovd %edi, %k2 12895; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} 12896; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 12897; AVX512BW-NEXT: kmovd %edi, %k1 12898; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1} 12899; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm18 12900; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 12901; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm16 12902; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} 12903; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} 12904; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] 12905; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] 12906; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 12907; AVX512BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 12908; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] 12909; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 12910; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 12911; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 12912; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] 12913; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 12914; AVX512BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 12915; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} 12916; 
AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1} 12917; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm20 12918; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 12919; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 12920; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} 12921; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} 12922; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] 12923; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 12924; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 12925; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] 12926; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] 12927; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 12928; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 12929; AVX512BW-NEXT: movw $31, %di 12930; AVX512BW-NEXT: kmovd %edi, %k2 12931; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2} 12932; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] 12933; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] 12934; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 12935; AVX512BW-NEXT: vpermt2w %zmm11, %zmm21, %zmm24 12936; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1} 12937; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm21 12938; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 12939; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 12940; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} 12941; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} 12942; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] 12943; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 12944; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] 12945; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] 12946; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm1 12947; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} 12948; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] 12949; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] 12950; AVX512BW-NEXT: vpermt2w %zmm11, %zmm10, %zmm9 12951; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1} 12952; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm4 12953; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm0 12954; AVX512BW-NEXT: vpermt2w %zmm3, %zmm20, %zmm2 12955; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} 12956; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} 12957; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) 12958; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) 12959; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) 12960; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) 12961; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) 12962; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) 12963; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) 12964; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) 12965; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) 12966; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) 12967; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 12968; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) 12969; AVX512BW-NEXT: vzeroupper 12970; AVX512BW-NEXT: retq 12971; 12972; AVX512BW-FCP-LABEL: load_i16_stride6_vf64: 12973; AVX512BW-FCP: # %bb.0: 12974; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 12975; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 12976; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 12977; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 12978; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 12979; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), 
; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm8
; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm8 {%k2}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm14
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm16
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm18
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm14
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm14 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm14 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm18
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm20
; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm16
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm20
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm22
; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm18
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23
; AVX512BW-FCP-NEXT: movw $31, %di
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm21, %zmm24
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1}
; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm21
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm22
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm20
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm10
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm10, %zmm9
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm4
; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm20, %zmm2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride6_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm11
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8
; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm8 {%k2}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm14
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm16
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm18
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm14
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm14 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm18
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20
; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm16
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm20
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22
; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23
; AVX512DQ-BW-NEXT: movw $31, %di
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm21, %zmm24
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1}
; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm21
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm10, %zmm9
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm4
; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm0
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm20, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58]
; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm14 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23
; AVX512DQ-BW-FCP-NEXT: movw $31, %di
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm21, %zmm24
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm10, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm20, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <384 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
  %strided.vec1 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
  %strided.vec2 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
  %strided.vec3 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
  %strided.vec4 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
  %strided.vec5 = shufflevector <384 x i16> %wide.vec, <384 x i16> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
  store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
  ret void
}